From 7a4343df61e6094ae5a2b2eda36c0707d7c1fd2d Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 27 Dec 2023 11:19:46 +0800
Subject: [PATCH 01/90] first update for migration

---
 CMakeLists.txt |    76 +-
 ggml-sycl.cpp  | 12393 +++++++++++++++++++++++++++++++++++++++++++++++
 ggml-sycl.hpp  |     4 +
 ggml.h         |     2 +-
 4 files changed, 12461 insertions(+), 14 deletions(-)
 create mode 100644 ggml-sycl.cpp
 create mode 100644 ggml-sycl.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5a333ff524b65..51089c3b5b742 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,6 @@
 cmake_minimum_required(VERSION 3.14)  # for add_link_options and implicit target directories.
 project("llama.cpp" C CXX)
+include(CheckIncludeFileCXX)
 
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
@@ -96,11 +97,11 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                              "llama: max. batch size for using peer access")
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
-option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
 option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
+option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 
@@ -122,7 +123,7 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 # Compile flags
 #
 
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
@@ -338,18 +339,11 @@ if (LLAMA_CUBLAS)
         add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
 
         if (LLAMA_STATIC)
-            if (WIN32)
-                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
-            else ()
-                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
-            endif()
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
         else()
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
         endif()
 
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
-
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         # 52 == lowest CUDA 12 standard
         # 60 == f16 CUDA intrinsics
@@ -426,9 +420,6 @@ if (LLAMA_HIPBLAS)
     if (${hipblas_FOUND} AND ${hip_FOUND})
         message(STATUS "HIP and hipBLAS found")
         add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-        if (LLAMA_HIP_UMA)
-            add_compile_definitions(GGML_HIP_UMA)
-        endif()
         add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
         if (BUILD_SHARED_LIBS)
             set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -454,6 +445,64 @@ if (LLAMA_HIPBLAS)
     endif()
 endif()
 
+
+if (LLAMA_SYCL)
+    set(ENABLE_AOT ats)
+    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for SYCL")
+    endif()
+    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
+        message(WARNING "Only LLVM is supported for SYCL")
+    endif()
+
+    #find_package(SYCL REQUIRED)
+    find_package(IntelSYCL REQUIRED)
+
+    # Check SYCL support by the compiler
+    check_cxx_compiler_flag("-fsycl" _fsycl_option)
+    if (_fsycl_option)
+	    #set (CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} "/opt/intel/oneapi/compiler/2024.0/include")
+        CHECK_INCLUDE_FILE_CXX("sycl/sycl.hpp" _sycl_header "-fsycl")
+	    set (_sycl_header "/opt/intel/oneapi/compiler/2024.0/include/sycl/sycl.hpp")
+        if (NOT _sycl_header)
+            CHECK_INCLUDE_FILE_CXX("CL/sycl.hpp" _sycl_header_old "-fsycl")
+        endif()
+        if (_sycl_header OR _sycl_header_old)
+            set(_sycl_support TRUE)
+        endif()
+    endif()
+
+    if (_sycl_support)
+	    add_compile_definitions(GGML_USE_CUBLAS)
+	    #add_compile_options(-std=c++17 -O3 -fsycl)
+        add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include)
+        add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include/sycl)
+        add_compile_options(-I/opt/intel/oneapi/dpcpp-ct/2024.0/include)
+        add_compile_options(-I/opt/intel/oneapi/2024.0/include)
+
+	    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
+
+        set(GGML_HEADERS_SYCL ggml-cuda.h ggml.h ggml-sycl.hpp)
+        set(GGML_SOURCES_SYCL ggml-sycl.cpp)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl  mkl_sycl_blas mkl_sycl_lapack mkl_sycl_dft mkl_sycl_sparse mkl_sycl_vm mkl_sycl_rng mkl_sycl_stats mkl_sycl_data_fitting mkl_intel_ilp64 mkl_tbb_thread)
+
+        #add_library(ggml-sycl OBJECT ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL})
+        #add_executable(${PROJECT_NAME} ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL})
+        #target_link_libraries(ggml-sycl PRIVATE sycl)
+        #target_compile_options(${PROJECT_NAME} PRIVATE ${CMAKE_CXX_FLAGS})
+        #set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl)
+	#add_sycl_to_target({})
+
+    else()
+        message(FATAL_ERROR "SYCL Support is not present")
+    endif()
+endif()
+
+
+
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")
@@ -790,6 +839,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_METAL}  ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI}    ${GGML_HEADERS_MPI}
             ${GGML_SOURCES_EXTRA}  ${GGML_HEADERS_EXTRA}
+            ${GGML_SOURCES_SYCL}  ${GGML_HEADERS_SYCL}
             )
 
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
new file mode 100644
index 0000000000000..160cdf63a502f
--- /dev/null
+++ b/ggml-sycl.cpp
@@ -0,0 +1,12393 @@
+#define DPCT_PROFILING_ENABLED
+#define DPCT_COMPAT_RT_VERSION 12010
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <algorithm>
+#include <assert.h>
+#include <atomic>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <float.h>
+#include <limits>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include <dpct/blas_utils.hpp>
+
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif // __HIP_PLATFORM_AMD__
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F  HIPBLAS_R_16F
+#define CUDA_R_32F  HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#ifdef GGML_HIP_UMA
+#define cudaMalloc hipMallocManaged
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
+#else
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#endif
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamFireAndForget hipStreamFireAndForget
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#define __trap abort
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
+#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
+#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
+#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
+#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
+#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
+#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
+#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+#else
+
+#if DPCT_COMPAT_RT_VERSION < 11020
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif // CUDART_VERSION < 11020
+
+#endif // defined(GGML_USE_HIPBLAS)
+
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+#include <cmath>
+
+#include <dpct/lib_common_utils.hpp>
+
+#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_VOLTA      700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
+
+#define GGML_CUDA_MAX_NODES 8192
+
+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// -  7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+//       probably other such cases, and not sure what happens on AMD hardware
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(__gfx1100__)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif // defined(GGML_USE_HIPBLAS)
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+#if DPCT_COMPAT_RT_VERSION >= 12000
+    static const char *cublas_get_error_str(const int err) {
+        /*
+        DPCT1009:63: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        return "cublasGetStatusString is not supported" /*cublasGetStatusString(err)*/
+            ;
+    }
+#else
+    static const char * cublas_get_error_str(const cublasStatus_t err) {
+        switch (err) {
+            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+            default: return "unknown error";
+        }
+    }
+#endif // CUDART_VERSION >= 12000
+
+[[noreturn]]
+static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
+    fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg);
+    fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
+    GGML_ASSERT(!"CUDA error");
+}
+
+/*
+DPCT1001:65: The statement could not be removed.
+*/
+/*
+DPCT1000:66: Error handling if-stmt was detected but could not be rewritten.
+*/
+/*
+DPCT1009:67: SYCL uses exceptions to report errors and does not use the error
+codes. The original code was commented out and a warning string was inserted.
+You need to rewrite this code.
+*/
+#define CUDA_CHECK(err) do {                                                   \
+    auto err_ = (err); if (err_ != 0) ggml_cuda_error(                         \
+        #err, __func__, __FILE__, __LINE__,                                    \
+        "cudaGetErrorString is not supported" /*cudaGetErrorString(err_)*/);   \
+} while (0)
+#define CUBLAS_CHECK(err)                                                      \
+    do { auto err_ = (err); if (err_ != 0)                                     \
+             ggml_cuda_error(#err, __func__, __FILE__, __LINE__,               \
+                             cublas_get_error_str(err_)); } while (0)
+
+#if !defined(GGML_USE_HIPBLAS)
+static const char *cu_get_error_str(int err) {
+    const char * err_str;
+    /*
+    DPCT1007:64: Migration of cuGetErrorString is not supported.
+    */
+    cuGetErrorString(err, &err_str);
+    return err_str;
+}
+/*
+DPCT1001:82: The statement could not be removed.
+*/
+/*
+DPCT1000:83: Error handling if-stmt was detected but could not be rewritten.
+*/
+#define CU_CHECK(err)                                                          \
+    do { auto err_ = (err);                                                    \
+         if (err_ != 0) ggml_cuda_error(#err, __func__, __FILE__, __LINE__,    \
+                                        cu_get_error_str(err_)); } while (0)
+#endif
+
+#if DPCT_COMPAT_RT_VERSION >= 11100
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
+#ifdef GGML_CUDA_F16
+typedef half dfloat; // dequantize float
+typedef half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef sycl::float2 dfloat2;
+#endif //GGML_CUDA_F16
+
+static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) {
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] <<  0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __dpct_inline__ int get_int_from_uint8(const uint8_t *x8,
+                                              const int &i32) {
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] <<  0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __dpct_inline__ int get_int_from_int8_aligned(const int8_t *x8,
+                                                     const int &i32) {
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+static __dpct_inline__ int get_int_from_uint8_aligned(const uint8_t *x8,
+                                                      const int &i32) {
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+template <typename T>
+using to_t_cuda_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
+                             int k, dpct::queue_ptr stream);
+typedef to_t_cuda_t<float> to_fp32_cuda_t;
+typedef to_t_cuda_t<sycl::half> to_fp16_cuda_t;
+
+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
+typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
+typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
+typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+typedef void (*ggml_cuda_op_mul_mat_t)(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream);
+typedef void (*ggml_cuda_op_flatten_t)(const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst, const float *src0_dd,
+                                       const float *src1_dd, float *dst_dd,
+                                       const dpct::queue_ptr &main_stream);
+
+// QK = number of values after dequantization
+// QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
+
+#define QK4_0 32
+#define QR4_0 2
+#define QI4_0 (QK4_0 / (4 * QR4_0))
+typedef struct dpct_type_471834 {
+    sycl::half d;           // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define QK4_1 32
+#define QR4_1 2
+#define QI4_1 (QK4_1 / (4 * QR4_1))
+typedef struct dpct_type_143705 {
+    sycl::half2 dm;         // dm.x = delta, dm.y = min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+#define QK5_0 32
+#define QR5_0 2
+#define QI5_0 (QK5_0 / (4 * QR5_0))
+typedef struct dpct_type_673649 {
+    sycl::half d;           // delta
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_0 / 2];  // nibbles / quants
+} block_q5_0;
+static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+#define QK5_1 32
+#define QR5_1 2
+#define QI5_1 (QK5_1 / (4 * QR5_1))
+typedef struct dpct_type_135589 {
+    sycl::half2 dm;         // dm.x = delta, dm.y = min
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_1 / 2];  // nibbles / quants
+} block_q5_1;
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+#define QK8_0 32
+#define QR8_0 1
+#define QI8_0 (QK8_0 / (4 * QR8_0))
+typedef struct dpct_type_122878 {
+    sycl::half d;           // delta
+    int8_t  qs[QK8_0];      // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
+
+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 (QK8_1 / (4 * QR8_1))
+typedef struct dpct_type_143721 {
+    sycl::half2 ds;         // ds.x = delta, ds.y = sum
+    int8_t  qs[QK8_0];      // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+typedef void (*allocate_tiles_cuda_t)(int **x_ql, sycl::half2 **x_dm,
+                                      int **x_qh, int **x_sc);
+typedef void (*load_tiles_cuda_t)(const void *__restrict__ vx,
+                                  int *__restrict__ x_ql,
+                                  sycl::half2 *__restrict__ x_dm,
+                                  int *__restrict__ x_qh,
+                                  int *__restrict__ x_sc, const int &i_offset,
+                                  const int &i_max, const int &k,
+                                  const int &blocks_per_row);
+typedef float (*vec_dot_q_mul_mat_cuda_t)(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
+    const int &i, const int &j, const int &k);
+
+//================================= k-quants
+
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
+#define QK_K 256
+#define K_SCALE_SIZE 12
+#endif
+
+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
+typedef struct dpct_type_619598 {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    sycl::half2 dm;          // super-block scale for quantized scales/mins
+} block_q2_K;
+static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+
+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
+typedef struct dpct_type_138576 {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#ifdef GGML_QKK_64
+    uint8_t scales[2]; // scales, quantized with 8 bits
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    sycl::half d; // super-block scale
+} block_q3_K;
+//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
+
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
+#ifdef GGML_QKK_64
+typedef struct {
+    half    dm[2];             // super-block scales/mins
+    uint8_t scales[2];         // 4-bit block scales/mins
+    uint8_t qs[QK_K/2];        // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
+typedef struct dpct_type_154943 {
+    sycl::half2 dm;            // super-block scale for quantized scales/mins
+    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+#endif
+
+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
+#ifdef GGML_QKK_64
+typedef struct {
+    half d;                  // super-block scale
+    int8_t scales[QK_K/16];  // block scales
+    uint8_t qh[QK_K/8];      // quants, high bit
+    uint8_t qs[QK_K/2];      // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
+typedef struct dpct_type_866817 {
+    sycl::half2 dm;               // super-block scale for quantized scales/mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];           // quants, high bit
+    uint8_t qs[QK_K/2];           // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif
+
+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
+typedef struct dpct_type_107281 {
+    uint8_t ql[QK_K/2];   // quants, lower 4 bits
+    uint8_t qh[QK_K/4];   // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales
+    sycl::half d;            // delta
+} block_q6_K;
+static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
+
+#define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+
+#define CUDA_GELU_BLOCK_SIZE 256
+#define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_TANH_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
+#define CUDA_CPY_BLOCK_SIZE 32
+#define CUDA_SCALE_BLOCK_SIZE 256
+#define CUDA_CLAMP_BLOCK_SIZE 256
+#define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
+#define CUDA_ALIBI_BLOCK_SIZE 32
+#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
+#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
+#define CUDA_UPSCALE_BLOCK_SIZE 256
+#define CUDA_CONCAT_BLOCK_SIZE 256
+#define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ACC_BLOCK_SIZE 256
+#define CUDA_IM2COL_BLOCK_SIZE 256
+
+// dmmv = dequantize_mul_mat_vec
+#ifndef GGML_CUDA_DMMV_X
+#define GGML_CUDA_DMMV_X 32
+#endif
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
+#endif
+
+#ifndef K_QUANTS_PER_ITERATION
+#define K_QUANTS_PER_ITERATION 2
+#else
+static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+#endif
+
+#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#define MAX_STREAMS 8
+static dpct::queue_ptr g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = {
+    {&dpct::get_in_order_queue()}};
+
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    dpct::event_ptr
+        events[GGML_CUDA_MAX_DEVICES]
+              [MAX_STREAMS]; // events for synchronizing multiple GPUs
+};
+
+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+inline dpct::err0 ggml_cuda_set_device(const int device) try {
+    int current_device;
+    CUDA_CHECK(current_device = dpct::dev_mgr::instance().current_device_id());
+
+    if (device == current_device) {
+        return 0;
+    }
+
+    /*
+    DPCT1093:68: The "device" device may be not the one intended for use. Adjust
+    the selected device if needed.
+    */
+    return DPCT_CHECK_ERROR(dpct::select_device(device));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static int g_device_count = -1;
+static int g_main_device = 0;
+static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+
+struct cuda_device_capabilities {
+    int     cc;                 // compute capability
+    bool    vmm;                // virtual memory support
+    size_t  vmm_granularity;    // granularity of virtual memory
+};
+
+static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
+
+
+static void * g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 0; // disabled by default
+static size_t g_scratch_offset = 0;
+
+static dpct::queue_ptr g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+[[noreturn]]
+static void bad_arch(const sycl::stream &stream_ct1) {
+    stream_ct1 << "ERROR: ggml-cuda was compiled without support for the "
+                  "current GPU architecture.\n";
+    __trap();
+
+    (void) bad_arch; // suppress unused function warning
+}
+
+static __dpct_inline__ float warp_reduce_sum(float x,
+                                             const sycl::nd_item<3> &item_ct1) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:0: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        /*
+        DPCT1096:113: The right-most dimension of the work-group used in the
+        SYCL kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
+    }
+    return x;
+}
+
+static __dpct_inline__ sycl::float2
+warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:1: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
+                                                mask);
+        /*
+        DPCT1023:2: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
+                                                mask);
+    }
+    return a;
+}
+
+static __dpct_inline__ float warp_reduce_max(float x,
+                                             const sycl::nd_item<3> &item_ct1) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:3: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        /*
+        DPCT1096:112: The right-most dimension of the work-group used in the
+        SYCL kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
+                              item_ct1.get_sub_group(), x, mask));
+    }
+    return x;
+}
+
+static __dpct_inline__ float op_repeat(const float a, const float b) {
+    return b;
+}
+
+static __dpct_inline__ float op_add(const float a, const float b) {
+    return a + b;
+}
+
+static __dpct_inline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
+
+static __dpct_inline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1));
+    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) /
+                   ne3;
+    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) %
+                   ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0;
+         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+}
+
+static void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
+static void gelu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f * xi *
+             (1.0f +
+              sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
+}
+
+static void silu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
+}
+
+static void gelu_quick_f32(const float *x, float *dst, int k,
+                           const sycl::nd_item<3> &item_ct1) {
+    const float GELU_QUICK_COEF = -1.702f;
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
+}
+
+static void tanh_f32(const float *x, float *dst, int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::tanh((float)(x[i]));
+}
+
+static void relu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmax((float)(x[i]), (float)0);
+}
+
+static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmax((float)(x[i]), (float)0) +
+             sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
+}
+
+static void sqr_f32(const float * x, float * dst, const int k,
+                    const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
+template <int block_size>
+static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
+                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+
+    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row*ncols + col];
+        mean_var.x() += xi;
+        mean_var.y() += xi * xi;
+    }
+
+    // sum up partial sums
+    mean_var = warp_reduce_sum(mean_var, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        /*
+        DPCT1118:4: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var, item_ct1);
+    }
+
+    const float mean = mean_var.x() / ncols;
+    const float var = mean_var.y() / ncols - mean * mean;
+    const float inv_std = sycl::rsqrt(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
+
+static void concat_f32(const float  *x,const float  *y, float *dst, const int ne0, const int ne02,
+                       const sycl::nd_item<3> &item_ct1) {
+    int nidx = item_ct1.get_local_id(2) +
+               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    if (item_ct1.get_group(0) < ne02) { // src0
+        int offset_src =
+            nidx + item_ct1.get_group(1) * ne0 +
+            item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+            dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            nidx + item_ct1.get_group(1) * ne0 +
+            (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
+            dst[offset_dst] = y[offset_src];
+    }
+}
+
+static void upscale_f32(const float  *x, float *dst, const int ne00, const int nb02, const int scale_factor,
+                        const sycl::nd_item<3> &item_ct1) {
+    int ne0 = ne00 * scale_factor;
+    int nidx = item_ct1.get_local_id(2) +
+               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int i00 = nidx / scale_factor;
+    int i01 = item_ct1.get_group(1) / scale_factor;
+    int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
+    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    dst[offset_dst] = x[offset_src];
+}
+
+static void pad_f32(const float  *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
+                    const sycl::nd_item<3> &item_ct1) {
+    int nidx = item_ct1.get_local_id(2) +
+               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
+        item_ct1.get_group(0) < ne02) {
+        int offset_src = nidx + item_ct1.get_group(1) * ne00 +
+                         item_ct1.get_group(0) * ne00 * ne01;
+            dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+template <int block_size>
+static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
+                           const sycl::nd_item<3> &item_ct1, float *s_sum) {
+    int start = item_ct1.get_group(2) * group_size;
+    int end = start + group_size;
+
+    start += item_ct1.get_local_id(2);
+
+    if (end >= ne_elements) {
+        end = ne_elements;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:5: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:69: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:6: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:70: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float variance = tmp / group_size;
+    float scale = sycl::rsqrt(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
+template <int block_size>
+static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
+                         const sycl::nd_item<3> &item_ct1, float *s_sum) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row*ncols + col];
+        tmp += xi * xi;
+    }
+
+    // sum up partial sums
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:7: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    const float mean = tmp / ncols;
+    const float scale = sycl::rsqrt(mean + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = scale * x[row*ncols + col];
+    }
+}
+
+static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x() = vui & 0xF;
+    v.y() = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {8.0f, 8.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x() = (v.x() - 8.0f) * d;
+    v.y() = (v.y() - 8.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const dfloat d = x[ib].dm[1];
+    const dfloat m = x[ib].dm[0];
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x() = vui & 0xF;
+    v.y() = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x() = (v.x() * d) + m;
+    v.y() = (v.y() * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {16.0f, 16.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x() = (v.x() - 16.0f) * d;
+    v.y() = (v.y() - 16.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const dfloat d = x[ib].dm[1];
+    const dfloat m = x[ib].dm[0];
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x() = (v.x() * d) + m;
+    v.y() = (v.y() * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    v.x() = x[ib].qs[iqs + 0];
+    v.y() = x[ib].qs[iqs + 1];
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+#else
+    v.x() *= d;
+    v.y() *= d;
+#endif // GGML_CUDA_F16
+}
+
+//================================== k-quants
+
+template<typename dst_t>
+static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+    const block_q2_K * x = (const block_q2_K *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int n   = tid/32;
+    const int l   = tid - 32*n;
+    const int is  = 8*n + l/16;
+
+    const uint8_t q = x[i].qs[32*n + l];
+    dst_t * y = yy + i*QK_K + 128*n;
+
+    float dall = x[i].dm[1];
+    float dmin = x[i].dm[0];
+    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
+    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
+    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
+#else
+    const int is = tid/16;  // 0 or 1
+    const int il = tid%16;  // 0...15
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    dst_t * y = yy + i*QK_K + 16*is + il;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
+    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
+#endif
+
+}
+
+template<typename dst_t>
+static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+    const block_q3_K * x = (const block_q3_K *) vx;
+
+#if QK_K == 256
+    const int r = item_ct1.get_local_id(2) / 4;
+    const int tid = r/2;
+    const int is0 = r%2;
+    const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
+    const int n = tid / 4;
+    const int j = tid - 4*n;
+
+    uint8_t m = 1 << (4*n + j);
+    int is = 8*n + 2*j + is0;
+    int shift = 2*j;
+
+    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
+                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
+                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
+                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
+    float d_all = x[i].d;
+    float dl = d_all * (us - 32);
+
+    dst_t * y = yy + i*QK_K + 128*n + 32*j;
+    const uint8_t * q = x[i].qs + 32*n;
+    const uint8_t * hm = x[i].hmask;
+
+    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
+#else
+    const int tid = threadIdx.x;
+    const int is  = tid/16;  // 0 or 1
+    const int il  = tid%16;  // 0...15
+    const int im  = il/8;    // 0...1
+    const int in  = il%8;    // 0...7
+
+    dst_t * y = yy + i*QK_K + 16*is + il;
+
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    const uint8_t h = x[i].hmask[in] >> (2*is + im);
+    const float   d = (float)x[i].d;
+
+    if (is == 0) {
+        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    } else {
+        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    }
+#endif
+
+}
+
+#if QK_K == 256
+static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
+    if (j < 4) {
+        d = q[j] & 63; m = q[j + 4] & 63;
+    } else {
+        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
+    }
+}
+#endif
+
+template<typename dst_t>
+static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q4_K * x = (const block_q4_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+
+#if QK_K == 256
+    // assume 32 threads
+    const int tid = item_ct1.get_local_id(2);
+    const int il  = tid/8;
+    const int ir  = tid%8;
+    const int is  = 2*il;
+    const int n   = 4;
+
+    dst_t * y = yy + i*QK_K + 64*il + n*ir;
+
+    const float dall = x[i].dm[1];
+    const float dmin = x[i].dm[0];
+
+    const uint8_t * q = x[i].qs + 32*il + n*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+    for (int l = 0; l < n; ++l) {
+        y[l + 0] = d1 * (q[l] & 0xF) - m1;
+        y[l +32] = d2 * (q[l] >>  4) - m2;
+    }
+#else
+    const int tid = threadIdx.x;
+    const uint8_t * q = x[i].qs;
+    dst_t * y = yy + i*QK_K;
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
+    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
+    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q5_K * x = (const block_q5_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+
+#if QK_K == 256
+    // assume 64 threads - this is very slightly better than the one below
+    const int tid = item_ct1.get_local_id(2);
+    const int il  = tid/16;   // il is in 0...3
+    const int ir  = tid%16;   // ir is in 0...15
+    const int is  = 2*il;     // is is in 0...6
+
+    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
+
+    const float dall = x[i].dm[1];
+    const float dmin = x[i].dm[0];
+
+    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
+    const uint8_t * qh = x[i].qh + 2*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+
+    uint8_t   hm  = 1 << (2*il);
+    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
+    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
+    hm <<= 1;
+    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
+    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
+#else
+    const int tid = threadIdx.x;
+    const uint8_t q = x[i].qs[tid];
+    const int im = tid/8;  // 0...3
+    const int in = tid%8;  // 0...7
+    const int is = tid/16; // 0 or 1
+    const uint8_t h = x[i].qh[in] >> im;
+    const float d = x[i].d;
+    dst_t * y = yy + i*QK_K + tid;
+    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
+    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q6_K * x = (const block_q6_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+#if QK_K == 256
+
+    // assume 64 threads - this is very slightly better than the one below
+    const int tid = item_ct1.get_local_id(2);
+    const int ip  = tid/32;   // ip is 0 or 1
+    const int il  = tid - 32*ip; // 0...32
+    const int is  = 8*ip + il/16;
+
+    dst_t * y = yy + i*QK_K + 128*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t * ql = x[i].ql + 64*ip + il;
+    const uint8_t   qh = x[i].qh[32*ip + il];
+    const int8_t  * sc = x[i].scales + is;
+
+    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
+    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
+#else
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int ip  = tid/16;         // 0 or 1
+    const int il  = tid - 16*ip;    // 0...15
+
+    dst_t * y = yy + i*QK_K + 16*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t   ql = x[i].ql[16*ip + il];
+    const uint8_t   qh = x[i].qh[il] >> (2*ip);
+    const int8_t  * sc = x[i].scales;
+
+    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+#endif
+}
+
+/*
+DPCT1110:8: The total declared local variable size in device function
+dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q2_K * x = (const block_q2_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
+    const int q_offset = 32*im + l0;
+    const int s_offset = 8*im;
+    const int y_offset = 128*im + l0;
+
+    uint32_t aux[4];
+    const uint8_t * d = (const uint8_t *)aux;
+    const uint8_t * m = (const uint8_t *)(aux + 2);
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+
+        const float dall = x[i].dm[1];
+        const float dmin = x[i].dm[0];
+
+        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
+        aux[0] = a[0] & 0x0f0f0f0f;
+        aux[1] = a[1] & 0x0f0f0f0f;
+        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
+        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
+            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
+                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
+
+        }
+        tmp += dall * sum1 - dmin * sum2;
+
+    }
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;
+
+    uint32_t uaux[2];
+    const uint8_t * d = (const uint8_t *)uaux;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint32_t * s = (const uint32_t *)x[i].scales;
+
+        uaux[0] = s[0] & 0x0f0f0f0f;
+        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
+
+        const float2 dall = __half22float2(x[i].dm);
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t ql = q[l];
+            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
+                  + y[l+16] * d[1] * ((ql >> 2) & 3)
+                  + y[l+32] * d[2] * ((ql >> 4) & 3)
+                  + y[l+48] * d[3] * ((ql >> 6) & 3);
+            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
+        }
+        tmp += dall.x * sum1 - dall.y * sum2;
+    }
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:9: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:10: The total declared local variable size in device function
+dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q3_K * x = (const block_q3_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+
+    const uint16_t kmask1 = 0x0303;
+    const uint16_t kmask2 = 0x0f0f;
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
+    const int step = 16/K_QUANTS_PER_ITERATION;
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0....15 or 0...7
+
+    const uint8_t m = 1 << (4*im);
+
+    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
+    const int q_offset =  32*im + l0;
+    const int y_offset = 128*im + l0;
+
+    uint16_t utmp[4];
+    const int8_t * s = (const int8_t *)utmp;
+
+    const uint16_t s_shift = 4*im;
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+        const uint8_t * h = x[i].hmask + l0;
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
+        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
+
+        const float d = x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < n; ++l) {
+            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
+            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
+        }
+        tmp += d * sum;
+
+    }
+#else
+
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
+    const int in = offset/8;                                 // 0 or 1
+    const int im = offset%8;                                 // 0...7
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint8_t * s = x[i].scales;
+
+        const float dall = (float)x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t hl = x[i].hmask[im+l] >> in;
+            const uint8_t ql = q[l];
+            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
+                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
+                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
+                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
+        }
+        tmp += sum;
+    }
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:11: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:12: The total declared local variable size in device function
+dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q4_K * x = (const block_q4_K *)vx + ib0;
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
+
+    const int il  = tid/step;                            // 0...3
+    const int ir  = tid - step*il;                       // 0...7 or 0...3
+    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y1 = yy + i*QK_K + y_offset;
+        const float   * y2 = y1 + 128;
+
+        const float dall = x[i].dm[1];
+        const float dmin = x[i].dm[0];
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
+        sycl::float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 4; ++l) {
+            s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4];
+            s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f +
+                       s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) -
+               dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
+
+    }
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    float tmp = 0;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t * q = x[i].qs + step;
+        const float   * y = yy + i*QK_K + step;
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux16[0] = a[0] & 0x0f0f;
+        aux16[1] = (a[0] >> 4) & 0x0f0f;
+        const float d = (float)x[i].dm[0];
+        const float m = (float)x[i].dm[1];
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
+                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
+                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
+                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
+        }
+        tmp += sum;
+    }
+
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:13: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:14: The total declared local variable size in device function
+dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2);
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q5_K * x = (const block_q5_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid = item_ct1.get_local_id(2) / 2; // 0...15
+    const int ix = item_ct1.get_local_id(2) % 2;
+
+    const int il  = tid/4;     // 0...3
+    const int ir  = tid - 4*il;// 0...3
+    const int n   = 2;
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    const uint8_t hm1  = 1 << (2*im);
+    const uint8_t hm2  = hm1 << 4;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2) {
+
+        const uint8_t * ql1 = x[i].qs + q_offset;
+        const uint8_t * qh  = x[i].qh + l0;
+        const float   * y1  = yy + i*QK_K + y_offset;
+        const float   * y2  = y1 + 128;
+
+        const float dall = x[i].dm[1];
+        const float dmin = x[i].dm[0];
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+        sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f;
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
+        for (int l = 0; l < n; ++l) {
+            sum.x() +=
+                y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
+                y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
+            sum.y() +=
+                y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
+                y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
+            sum.z() +=
+                y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
+                y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
+            sum.w() +=
+                y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
+                y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
+            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
+        }
+        tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
+                       sum.w() * sc[5]) -
+               dmin * smin;
+    }
+
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+    const int step = tid * K_QUANTS_PER_ITERATION;
+    const int im = step/8;
+    const int in = step%8;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t * q = x[i].qs + step;
+        const int8_t  * s = x[i].scales;
+        const float   * y = yy + i*QK_K + step;
+        const float     d = x[i].d;
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            const uint8_t h = x[i].qh[in+j] >> im;
+            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
+                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
+                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
+                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
+        }
+        tmp += sum;
+    }
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:15: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q6_K * x = (const block_q6_K *)vx + ib0;
+
+#if QK_K == 256
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+#if K_QUANTS_PER_ITERATION == 1
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
+    const int is = 0;
+#else
+    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
+    const int is = in / 4;
+#endif
+    const int ql_offset = 64*im + l0;
+    const int qh_offset = 32*im + l0;
+    const int s_offset  =  8*im + is;
+    const int y_offset = 128*im + l0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + y_offset;
+        const uint8_t * ql = x[i].ql + ql_offset;
+        const uint8_t * qh = x[i].qh + qh_offset;
+        const int8_t  * s  = x[i].scales + s_offset;
+
+        const float d = x[i].d;
+
+#if K_QUANTS_PER_ITERATION == 1
+        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
+        tmp += sum;
+#else
+        float sum = 0;
+        for (int l = 0; l < 4; ++l) {
+            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
+        }
+        tmp += sum;
+#endif
+
+    }
+
+#else
+
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + step;
+        const uint8_t * ql = x[i].ql + step;
+        const uint8_t * qh = x[i].qh + step;
+        const int8_t  * s  = x[i].scales;
+
+        const float d = x[i+0].d;
+
+        float sum = 0;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
+                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
+                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
+                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
+        }
+        tmp += sum;
+
+    }
+
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:16: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const sycl::half *x = (const sycl::half *)vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x() = x[ib + iqs + 0];
+    v.y() = x[ib + iqs + 1];
+}
+
+static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const float * x = (const float *) vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x() = x[ib + iqs + 0];
+    v.y() = x[ib + iqs + 1];
+}
+
+static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+    if (ix >= kx_padded) {
+        return;
+    }
+
+    const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                   item_ct1.get_local_id(1);
+
+    const int i_padded = iy*kx_padded + ix;
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i_padded / QK8_1; // block index
+    const int iqs = i_padded % QK8_1; // quant index
+
+    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
+    float amax = sycl::fabs((float)xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:17: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
+                                    item_ct1.get_sub_group(), amax, mask));
+        /*
+        DPCT1023:18: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        sum +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : sycl::round(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    reinterpret_cast<sycl::half &>(y[ib].ds.x()) = d;
+    reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
+}
+
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void k_get_rows(
+            const void * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12,
+            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
+
+    const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                     item_ct1.get_local_id(2)) *
+                    2;
+    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) /
+                    ne12;
+    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) %
+                    ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
+
+    const int ib = i00/qk; // block index
+    const int iqs = (i00%qk)/qr; // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(src0_row, ib, iqs, v);
+
+    dst_row[iybs + iqs + 0] = v.x();
+    dst_row[iybs + iqs + y_offset] = v.y();
+}
+
+template<typename src0_t, typename dst_t>
+static void k_get_rows_float(
+            const src0_t * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12,
+            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
+
+    const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                    item_ct1.get_local_id(2);
+    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) /
+                    ne12;
+    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) %
+                    ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
+                             const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  2 * item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    const int ib = i/qk; // block index
+    const int iqs = (i%qk)/qr; // quant index
+    const int iybs = i - i%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(vx, ib, iqs, v);
+
+    y[iybs + iqs + 0] = v.x();
+    y[iybs + iqs + y_offset] = v.y();
+}
+
+// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
+// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
+
+#define VDR_Q4_0_Q8_1_MMVQ 2
+#define VDR_Q4_0_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q4_0_q8_1_impl(const int *v, const int *u, const float &d4,
+                       const sycl::half2 &ds8, const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = __dp4a(vi0, u[2*i+0], sumi);
+        sumi = __dp4a(vi1, u[2*i+1], sumi);
+    }
+
+    const float2 ds8f = __half22float2(ds8);
+
+    // second part effectively subtracts 8 from each quant value
+    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_1_Q8_1_MMVQ 2
+#define VDR_Q4_1_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q4_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm4,
+                       const sycl::half2 &ds8, const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = __dp4a(vi0, u[2*i+0], sumi);
+        sumi = __dp4a(vi1, u[2*i+1], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
+    const float d4d8 = tmp.x;
+    const float m4s8 = tmp.y;
+#else
+    const float2 dm4f = __half22float2(dm4);
+    const float2 ds8f = __half22float2(ds8);
+    const float d4d8 = dm4f.x * ds8f.x;
+    const float m4s8 = dm4f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
+    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_0_Q8_1_MMVQ 2
+#define VDR_Q5_0_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
+                       const float &d5, const sycl::half2 &ds8,
+                       const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+    const float2 ds8f = __half22float2(ds8);
+
+    // second part effectively subtracts 16 from each quant value
+    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_1_Q8_1_MMVQ 2
+#define VDR_Q5_1_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
+                       const sycl::half2 &dm5, const sycl::half2 &ds8,
+                       const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
+    const float d5d8 = tmp.x;
+    const float m5s8 = tmp.y;
+#else
+    const float2 dm5f = __half22float2(dm5);
+    const float2 ds8f = __half22float2(ds8);
+    const float d5d8 = dm5f.x * ds8f.x;
+    const float m5s8 = dm5f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
+    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
+
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q8_0_Q8_1_MMVQ 2
+#define VDR_Q8_0_Q8_1_MMQ 8
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q8_0_q8_1_impl(const int *v, const int *u, const float &d8_0,
+                       const float &d8_1, const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = __dp4a(v[i], u[i], sumi);
+    }
+
+    return d8_0*d8_1 * sumi;
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q8_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm8,
+                       const sycl::half2 &ds8, const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = __dp4a(v[i], u[i], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
+    const float d8d8 = tmp.x;
+    const float m8s8 = tmp.y;
+#else
+    const float2 dm8f = __half22float2(dm8);
+    const float2 ds8f = __half22float2(ds8);
+    const float d8d8 = dm8f.x * ds8f.x;
+    const float m8s8 = dm8f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
+    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q2_K_Q8_1_MMVQ 1
+#define VDR_Q2_K_Q8_1_MMQ  2
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
+    const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales,
+    const sycl::half2 &dm2, const float *__restrict__ d8,
+    const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = scales[2*i];
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+
+        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m <<  8;
+        m |= m << 16;
+        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    const float2 dm2f = __half22float2(dm2);
+
+    return dm2f.x*sumf_d - dm2f.y*sumf_m;
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const uint8_t *__restrict__ scales,
+                           const sycl::half2 &dm2, const float &d8,
+                           const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi_d = 0;
+    int sumi_m = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
+        int sumi_d_sc = 0;
+
+        const int sc = scales[i0 / (QI8_1/2)];
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m <<  8;
+        m |= m << 16;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
+            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
+        }
+
+        sumi_d += sumi_d_sc * (sc & 0xF);
+    }
+
+    const float2 dm2f = __half22float2(dm2);
+
+    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q3_K_Q8_1_MMVQ 1
+#define VDR_Q3_K_Q8_1_MMQ  2
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
+    const int &vl, const int &vh, const int *__restrict__ u,
+    const uint8_t *__restrict__ scales, const int &scale_offset,
+    const float &d3, const float *__restrict__ d8,
+    const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d3 * sumf;
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const int8_t *__restrict__ scales, const float &d3,
+                           const float &d8, const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
+        int sumi_sc = 0;
+
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+        }
+
+        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
+    }
+
+    return d3*d8 * sumi;
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_K_Q8_1_MMVQ 2
+#define VDR_Q4_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const float *__restrict__ d8,
+    const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K; ++i) {
+        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
+        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8,
+    const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_K_Q8_1_MMVQ 2
+#define VDR_Q5_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
+    const int *__restrict__ vl, const int *__restrict__ vh,
+    const int *__restrict__ u, const uint8_t *__restrict__ sc,
+    const uint8_t *__restrict__ m, const sycl::half2 &dm5,
+    const float *__restrict__ d8, const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
+        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
+        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
+
+        const int v0i = vl0i | vh0i;
+        const int v1i = vl1i | vh1i;
+
+        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);
+
+    }
+
+    const float2 dm5f = __half22float2(dm5);
+
+    return dm5f.x*sumf_d - dm5f.y*sumf_m;
+
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8,
+    const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q6_K_Q8_1_MMVQ 1
+#define VDR_Q6_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(
+    const int &vl, const int &vh, const int *__restrict__ u,
+    const int8_t *__restrict__ scales, const float &d,
+    const float *__restrict__ d8, const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = scales[4*i];
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const int8_t *__restrict__ sc, const float &d6,
+                           const float *__restrict__ d8,
+                           const sycl::stream &stream_ct1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
+        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
+
+#pragma unroll
+        for (int i = i0; i < i0 + 2; ++i) {
+            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
+            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
+
+            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
+            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
+        }
+
+        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
+    }
+
+    return d6 * sumf_d;
+
+#else
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __dpct_inline__ float
+vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                  const sycl::stream &stream_ct1) {
+
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int v[VDR_Q4_0_Q8_1_MMVQ];
+    int u[2*VDR_Q4_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds,
+                                                      stream_ct1);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs, float *tile_x_d) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs;
+    *x_dm = (sycl::half2 *)tile_x_d;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_0;
+    const int kqsx = k % QI4_0;
+
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
+
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const float * x_dmf = (const float *) x_dm;
+
+    int u[2*VDR_Q4_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>(
+        &x_ql[i * (WARP_SIZE + 1) + k], u,
+        x_dmf[i * (WARP_SIZE / QI4_0) + i / QI4_0 + k / QI4_0],
+        y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)],
+        stream_ct1);
+}
+
+static __dpct_inline__ float
+vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                  const sycl::stream &stream_ct1) {
+
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    int v[VDR_Q4_1_Q8_1_MMVQ];
+    int u[2*VDR_Q4_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
+        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm,
+                                                      bq8_1->ds, stream_ct1);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs, sycl::half2 *tile_x_dm) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs;
+    *x_dm = tile_x_dm;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_1;
+    const int kqsx = k % QI4_1;
+
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
+        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+
+    int u[2*VDR_Q4_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>(
+        &x_ql[i * (WARP_SIZE + 1) + k], u,
+        x_dm[i * (WARP_SIZE / QI4_1) + i / QI4_1 + k / QI4_1],
+        y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)],
+        stream_ct1);
+}
+
+static __dpct_inline__ float
+vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                  const sycl::stream &stream_ct1) {
+
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int vl[VDR_Q5_0_Q8_1_MMVQ];
+    int vh[VDR_Q5_0_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
+        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
+    }
+
+    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d,
+                                                      bq8_1->ds, stream_ct1);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, float *tile_x_d) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_ql;
+    *x_dm = (sycl::half2 *)tile_x_d;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_0;
+    const int kqsx = k % QI5_0;
+
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        const int ql = get_int_from_uint8(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
+
+        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
+        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
+        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
+        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
+        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
+        qs0 = dpct::vectorized_binary<sycl::char4>(
+            qs0, 0x10101010, dpct::sub_sat()); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
+        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
+        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
+        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
+        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
+        qs1 = dpct::vectorized_binary<sycl::char4>(
+            qs1, 0x10101010, dpct::sub_sat()); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
+        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    int u[2*VDR_Q5_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
+    }
+
+    return vec_dot_q8_0_q8_1_impl<QR5_0 * VDR_Q5_0_Q8_1_MMQ>(
+        &x_ql[i * (2 * WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx],
+        y_df[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)],
+        stream_ct1);
+}
+
+static __dpct_inline__ float
+vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                  const sycl::stream &stream_ct1) {
+
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    int vl[VDR_Q5_1_Q8_1_MMVQ];
+    int vh[VDR_Q5_1_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
+        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
+        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
+    }
+
+    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm,
+                                                      bq8_1->ds, stream_ct1);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_1;
+    const int kqsx = k % QI5_1;
+
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
+
+        int qs0 = (ql >>  0) & 0x0F0F0F0F;
+        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
+        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
+        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
+        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4) & 0x0F0F0F0F;
+        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
+        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
+        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
+        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
+        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
+
+    int u[2*VDR_Q5_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
+    }
+
+    return vec_dot_q8_1_q8_1_impl<QR5_1 * VDR_Q5_1_Q8_1_MMQ>(
+        &x_ql[i * (2 * WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx],
+        y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)],
+        stream_ct1);
+}
+
+static __dpct_inline__ float
+vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                  const sycl::stream &stream_ct1) {
+
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int v[VDR_Q8_0_Q8_1_MMVQ];
+    int u[VDR_Q8_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
+        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+    }
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d,
+                                                      bq8_1->ds[1], stream_ct1);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs, float *tile_x_d) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs;
+    *x_dm = (sycl::half2 *)tile_x_d;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI8_0;
+    const int kqsx = k % QI8_0;
+    float * x_dmf = (float *) x_dm;
+
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
+        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    (void)x_qh; (void)x_sc;
+
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>(
+        &x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k],
+        x_dmf[i * (WARP_SIZE / QI8_0) + i / QI8_0 + k / QI8_0],
+        y_df[j * (WARP_SIZE / QI8_1) + k / QI8_1], stream_ct1);
+}
+
+static __dpct_inline__ float
+vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                  const sycl::stream &stream_ct1) {
+
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    const uint8_t * scales = bq2_K->scales + scale_offset;
+
+    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
+    int    u[QR2_K];
+    float d8[QR2_K];
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++ i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + i].ds[1];
+    }
+
+    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8, stream_ct1);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI2_K;
+    const int kqsx = k % QI2_K;
+
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
+        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
+
+        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
+    }
+}
+
+static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    (void)x_qh;
+
+    const int kbx = k / QI2_K;
+    const int ky  = (k % QI2_K) * QR2_K;
+    const float * y_df = (const float *) y_ds;
+
+    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
+
+    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
+    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
+
+#pragma unroll
+    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
+        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
+    }
+
+    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
+
+    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
+    return vec_dot_q2_K_q8_1_impl_mmq(
+        v, &y_qs[index_y], scales,
+        x_dm[i * (WARP_SIZE / QI2_K) + i / QI2_K + kbx], y_df[index_y / QI8_1],
+        stream_ct1);
+}
+
+static __dpct_inline__ float
+vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                  const sycl::stream &stream_ct1) {
+
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    const float d = bq3_K->d;
+
+    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
+
+    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
+
+    int    u[QR3_K];
+    float d8[QR3_K];
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + i].ds[1];
+    }
+
+    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset,
+                                       d, d8, stream_ct1);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh,
+                    int *tile_x_sc) {
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_qh = tile_x_qh;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI3_K;
+    const int kqsx = k % QI3_K;
+
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
+    const int kbxd = k % blocks_per_tile_x_row;
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
+        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
+        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
+
+        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
+
+        const int ksc = k % (QI3_K/4);
+
+        const int ksc_low = ksc % (QI3_K/8);
+        const int shift_low = 4 * (ksc / (QI3_K/8));
+        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
+
+        const int ksc_high = QI3_K/8;
+        const int shift_high = 2 * ksc;
+        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
+
+        const int sc = dpct::vectorized_binary<sycl::char4>(
+            sc_low | sc_high, 0x20202020, dpct::sub_sat());
+
+        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+
+    const int kbx  = k / QI3_K;
+    const int ky  = (k % QI3_K) * QR3_K;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+
+    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
+        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
+        const int shift = 2 * ((ky % 32) / 8);
+        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
+
+        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
+        const int vlh = (vh << 2) & 0x04040404;
+
+        v[l] = dpct::vectorized_binary<sycl::char4>(vll, vlh, dpct::sub_sat());
+    }
+
+    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
+    return vec_dot_q3_K_q8_1_impl_mmq(
+        v, &y_qs[index_y], scales,
+        x_dmf[i * (WARP_SIZE / QI3_K) + i / QI3_K + kbx], y_df[index_y / QI8_1],
+        stream_ct1);
+}
+
+static __dpct_inline__ float
+vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                  const sycl::stream &stream_ct1) {
+
+#ifndef GGML_QKK_64
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    int    v[2];
+    int    u[2*QR4_K];
+    float d8[QR4_K];
+
+    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
+
+    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    v[0] = q4[0];
+    v[1] = q4[4];
+
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = bq8i->ds[1];
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8, stream_ct1);
+
+#else
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    const uint16_t * a = (const uint16_t *)bq4_K->scales;
+    aux16[0] = a[0] & 0x0f0f;
+    aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
+
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);
+
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
+
+    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+    return dall * sumf_d - dmin * sumf_m;
+
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI4_K; // == k if QK_K == 256
+
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
+        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+#if QK_K == 256
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
+
+        const int * scales = (const int *) bxi->scales;
+
+        const int ksc = k % (WARP_SIZE/8);
+
+        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    (void)x_qh;
+
+    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
+
+    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k],
+                                      &y_qs[index_y], sc, sc + 8,
+                                      x_dm[i * (WARP_SIZE / QI4_K) + i / QI4_K],
+                                      &y_ds[index_y / QI8_1], stream_ct1);
+}
+
+static __dpct_inline__ float
+vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                  const sycl::stream &stream_ct1) {
+
+#ifndef GGML_QKK_64
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    int   vl[2];
+    int   vh[2];
+    int    u[2*QR5_K];
+    float d8[QR5_K];
+
+    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
+
+    vl[0] = ql[0];
+    vl[1] = ql[4];
+
+    vh[0] = qh[0] >> bq8_offset;
+    vh[1] = qh[4] >> bq8_offset;
+
+    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = bq8i->ds[0];
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8,
+                                       stream_ct1);
+
+#else
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int8_t * s = bq5_K->scales;
+
+    const float d = bq5_K->d;
+
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);
+
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
+
+    const int * ql = (const int *)bq5_K->qs + (iqs/2);
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
+
+    const int step = 4 * (iqs/2); // 0, 4, 8, 12
+    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
+    const int in = step%8; // 0, 4, 0, 4
+    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+    return d * sumf_d;
+
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI5_K; // == k if QK_K == 256
+
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
+        const int ky = QR5_K*kqsx;
+
+        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
+        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
+        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
+
+        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
+        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
+
+        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
+        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
+        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+#if QK_K == 256
+        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
+
+        const int * scales = (const int *) bxi->scales;
+
+        const int ksc = k % (WARP_SIZE/8);
+
+        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    (void)x_qh;
+
+    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
+
+    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
+    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc,
+                                      sc + 8,
+                                      x_dm[i * (WARP_SIZE / QI5_K) + i / QI5_K],
+                                      &y_ds[index_y / QI8_1], stream_ct1);
+}
+
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
+                  const sycl::stream &stream_ct1) {
+
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
+    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
+
+    const int8_t * scales = bq6_K->scales + scale_offset;
+
+    int    u[QR6_K];
+    float d8[QR6_K];
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + 2 * i].ds[1];
+    }
+
+    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8,
+                                       stream_ct1);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI6_K; // == k if QK_K == 256
+
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
+        const int ky = QR6_K*kqsx;
+
+        const int ql = get_int_from_uint8(bxi->ql, kqsx);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
+        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
+        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
+
+        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
+        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
+
+        x_ql[i * (2 * WARP_SIZE + 1) + kq0] =
+            dpct::vectorized_binary<sycl::char4>(ql0 | qh0, 0x20202020,
+                                                 dpct::sub_sat());
+        x_ql[i * (2 * WARP_SIZE + 1) + kq1] =
+            dpct::vectorized_binary<sycl::char4>(ql1 | qh1, 0x20202020,
+                                                 dpct::sub_sat());
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
+    }
+}
+
+static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    (void)x_qh;
+
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
+
+    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
+    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
+    return vec_dot_q6_K_q8_1_impl_mmq(
+        &x_ql[index_x], &y_qs[index_y], sc,
+        x_dmf[i * (WARP_SIZE / QI6_K) + i / QI6_K], &y_df[index_y / QI8_1],
+        stream_ct1);
+}
+
+template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
+          int mmq_y, int nwarps, allocate_tiles_cuda_t allocate_tiles,
+          load_tiles_cuda_t load_tiles, int vdr,
+          vec_dot_q_mul_mat_cuda_t vec_dot>
+/*
+DPCT1110:19: The total declared local variable size in device function mul_mat_q
+exceeds 128 bytes and may cause high register pressure. Consult with your
+hardware vendor to find the total register size available and adjust the code,
+or use smaller sub-group size to avoid high register pressure.
+*/
+static __dpct_inline__ void
+mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
+          float *__restrict__ dst, const int ncols_x, const int nrows_x,
+          const int ncols_y, const int nrows_y, const int nrows_dst,
+          const sycl::nd_item<3> &item_ct1, int *tile_y_qs,
+          sycl::half2 *tile_y_ds) {
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    const int blocks_per_row_x = ncols_x / qk;
+    const int blocks_per_col_y = nrows_y / QK8_1;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    const int & ncols_dst = ncols_y;
+
+    const int row_dst_0 = item_ct1.get_group(2) * mmq_y;
+    const int & row_x_0 = row_dst_0;
+
+    const int col_dst_0 = item_ct1.get_group(1) * mmq_x;
+    const int & col_y_0 = col_dst_0;
+
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
+
+    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
+
+        load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm,
+                   tile_x_qh, tile_x_sc, item_ct1.get_local_id(1),
+                   nrows_x - row_x_0 - 1, item_ct1.get_local_id(2),
+                   blocks_per_row_x);
+
+#pragma unroll
+        for (int ir = 0; ir < qr; ++ir) {
+            const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2);
+            const int kbxd = kqs / QI8_1;
+
+#pragma unroll
+            for (int i = 0; i < mmq_x; i += nwarps) {
+                const int col_y_eff = dpct::min(
+                    (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i),
+                    ncols_y - 1); // to prevent out-of-bounds memory accesses
+
+                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
+
+                const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE +
+                                    kqs % WARP_SIZE;
+                tile_y_qs[index_y] = get_int_from_int8_aligned(
+                    by0->qs, item_ct1.get_local_id(2) % QI8_1);
+            }
+
+#pragma unroll
+            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                const int ids =
+                    (ids0 + item_ct1.get_local_id(1) * QI8_1 +
+                     item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) %
+                    mmq_x;
+                const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1);
+                const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1);
+
+                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                const sycl::half2 *dsi_src =
+                    &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) +
+                       ir * (WARP_SIZE / QI8_1) + kby]
+                         .ds;
+                sycl::half2 *dsi_dst =
+                    &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby];
+                if (need_sum) {
+                    *dsi_dst = *dsi_src;
+                } else {
+                    float * dfi_dst = (float *) dsi_dst;
+                    *dfi_dst = (*dsi_src)[1];
+                }
+            }
+
+            /*
+            DPCT1118:20: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            /*
+            DPCT1065:71: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier();
+
+// #pragma unroll // unrolling this loop causes too much register pressure
+            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
+#pragma unroll
+                for (int j = 0; j < mmq_x; j += nwarps) {
+#pragma unroll
+                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+                        sum[i / WARP_SIZE][j / nwarps] += vec_dot(
+                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
+                            tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i,
+                            item_ct1.get_local_id(1) + j, k);
+                    }
+                }
+            }
+
+            /*
+            DPCT1118:21: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            /*
+            DPCT1065:72: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier();
+        }
+    }
+
+#pragma unroll
+    for (int j = 0; j < mmq_x; j += nwarps) {
+        const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1);
+
+        if (col_dst >= ncols_dst) {
+            return;
+        }
+
+#pragma unroll
+        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+            const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i;
+
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
+        }
+    }
+}
+
+#define  MMQ_X_Q4_0_RDNA2  64
+#define  MMQ_Y_Q4_0_RDNA2  128
+#define NWARPS_Q4_0_RDNA2  8
+#define  MMQ_X_Q4_0_RDNA1  64
+#define  MMQ_Y_Q4_0_RDNA1  64
+#define NWARPS_Q4_0_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q4_0_AMPERE 4
+#define  MMQ_Y_Q4_0_AMPERE 32
+#define NWARPS_Q4_0_AMPERE 4
+#else
+#define  MMQ_X_Q4_0_AMPERE 64
+#define  MMQ_Y_Q4_0_AMPERE 128
+#define NWARPS_Q4_0_AMPERE 4
+#endif
+#define  MMQ_X_Q4_0_PASCAL 64
+#define  MMQ_Y_Q4_0_PASCAL 64
+#define NWARPS_Q4_0_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::stream &stream_ct1) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
+    const int nwarps = NWARPS_Q4_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
+    const int nwarps = NWARPS_Q4_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+    const int nwarps = NWARPS_Q4_0_AMPERE;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+    const int nwarps = NWARPS_Q4_0_PASCAL;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_0_q8_1_mul_mat;
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q4_1_RDNA2  64
+#define  MMQ_Y_Q4_1_RDNA2  128
+#define NWARPS_Q4_1_RDNA2  8
+#define  MMQ_X_Q4_1_RDNA1  64
+#define  MMQ_Y_Q4_1_RDNA1  64
+#define NWARPS_Q4_1_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q4_1_AMPERE 4
+#define  MMQ_Y_Q4_1_AMPERE 32
+#define NWARPS_Q4_1_AMPERE 4
+#else
+#define  MMQ_X_Q4_1_AMPERE 64
+#define  MMQ_Y_Q4_1_AMPERE 128
+#define NWARPS_Q4_1_AMPERE 4
+#endif
+#define  MMQ_X_Q4_1_PASCAL 64
+#define  MMQ_Y_Q4_1_PASCAL 64
+#define NWARPS_Q4_1_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::stream &stream_ct1) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
+    const int nwarps = NWARPS_Q4_1_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
+    const int nwarps = NWARPS_Q4_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+    const int nwarps = NWARPS_Q4_1_AMPERE;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+    const int nwarps = NWARPS_Q4_1_PASCAL;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_1_q8_1_mul_mat;
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q5_0_RDNA2  64
+#define  MMQ_Y_Q5_0_RDNA2  128
+#define NWARPS_Q5_0_RDNA2  8
+#define  MMQ_X_Q5_0_RDNA1  64
+#define  MMQ_Y_Q5_0_RDNA1  64
+#define NWARPS_Q5_0_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q5_0_AMPERE 4
+#define  MMQ_Y_Q5_0_AMPERE 32
+#define NWARPS_Q5_0_AMPERE 4
+#else
+#define  MMQ_X_Q5_0_AMPERE 128
+#define  MMQ_Y_Q5_0_AMPERE 64
+#define NWARPS_Q5_0_AMPERE 4
+#endif
+#define  MMQ_X_Q5_0_PASCAL 64
+#define  MMQ_Y_Q5_0_PASCAL 64
+#define NWARPS_Q5_0_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::stream &stream_ct1) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
+    const int nwarps = NWARPS_Q5_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
+    const int nwarps = NWARPS_Q5_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+    const int nwarps = NWARPS_Q5_0_AMPERE;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+    const int nwarps = NWARPS_Q5_0_PASCAL;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_0_q8_1_mul_mat;
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q5_1_RDNA2  64
+#define  MMQ_Y_Q5_1_RDNA2  128
+#define NWARPS_Q5_1_RDNA2  8
+#define  MMQ_X_Q5_1_RDNA1  64
+#define  MMQ_Y_Q5_1_RDNA1  64
+#define NWARPS_Q5_1_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q5_1_AMPERE 4
+#define  MMQ_Y_Q5_1_AMPERE 32
+#define NWARPS_Q5_1_AMPERE 4
+#else
+#define  MMQ_X_Q5_1_AMPERE 128
+#define  MMQ_Y_Q5_1_AMPERE 64
+#define NWARPS_Q5_1_AMPERE 4
+#endif
+#define  MMQ_X_Q5_1_PASCAL 64
+#define  MMQ_Y_Q5_1_PASCAL 64
+#define NWARPS_Q5_1_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::stream &stream_ct1) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
+    const int nwarps = NWARPS_Q5_1_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
+    const int nwarps = NWARPS_Q5_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+    const int nwarps = NWARPS_Q5_1_AMPERE;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+    const int nwarps = NWARPS_Q5_1_PASCAL;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_1_q8_1_mul_mat;
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q8_0_RDNA2  64
+#define  MMQ_Y_Q8_0_RDNA2  128
+#define NWARPS_Q8_0_RDNA2  8
+#define  MMQ_X_Q8_0_RDNA1  64
+#define  MMQ_Y_Q8_0_RDNA1  64
+#define NWARPS_Q8_0_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q8_0_AMPERE 4
+#define  MMQ_Y_Q8_0_AMPERE 32
+#define NWARPS_Q8_0_AMPERE 4
+#else
+#define  MMQ_X_Q8_0_AMPERE 128
+#define  MMQ_Y_Q8_0_AMPERE 64
+#define NWARPS_Q8_0_AMPERE 4
+#endif
+#define  MMQ_X_Q8_0_PASCAL 64
+#define  MMQ_Y_Q8_0_PASCAL 64
+#define NWARPS_Q8_0_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::stream &stream_ct1) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
+    const int nwarps = NWARPS_Q8_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
+    const int nwarps = NWARPS_Q8_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+    const int nwarps = NWARPS_Q8_0_AMPERE;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+    const int nwarps = NWARPS_Q8_0_PASCAL;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q8_0_q8_1_mul_mat;
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q2_K_RDNA2  64
+#define  MMQ_Y_Q2_K_RDNA2  128
+#define NWARPS_Q2_K_RDNA2  8
+#define  MMQ_X_Q2_K_RDNA1  128
+#define  MMQ_Y_Q2_K_RDNA1  32
+#define NWARPS_Q2_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q2_K_AMPERE 4
+#define  MMQ_Y_Q2_K_AMPERE 32
+#define NWARPS_Q2_K_AMPERE 4
+#else
+#define  MMQ_X_Q2_K_AMPERE 64
+#define  MMQ_Y_Q2_K_AMPERE 128
+#define NWARPS_Q2_K_AMPERE 4
+#endif
+#define  MMQ_X_Q2_K_PASCAL 64
+#define  MMQ_Y_Q2_K_PASCAL 64
+#define NWARPS_Q2_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::stream &stream_ct1) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
+    const int nwarps = NWARPS_Q2_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
+    const int nwarps = NWARPS_Q2_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+    const int nwarps = NWARPS_Q2_K_AMPERE;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+    const int nwarps = NWARPS_Q2_K_PASCAL;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q2_K_q8_1_mul_mat;
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q3_K_RDNA2  128
+#define  MMQ_Y_Q3_K_RDNA2  64
+#define NWARPS_Q3_K_RDNA2  8
+#define  MMQ_X_Q3_K_RDNA1  32
+#define  MMQ_Y_Q3_K_RDNA1  128
+#define NWARPS_Q3_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q3_K_AMPERE 4
+#define  MMQ_Y_Q3_K_AMPERE 32
+#define NWARPS_Q3_K_AMPERE 4
+#else
+#define  MMQ_X_Q3_K_AMPERE 128
+#define  MMQ_Y_Q3_K_AMPERE 128
+#define NWARPS_Q3_K_AMPERE 4
+#endif
+#define  MMQ_X_Q3_K_PASCAL 64
+#define  MMQ_Y_Q3_K_PASCAL 64
+#define NWARPS_Q3_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::stream &stream_ct1) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
+    const int nwarps = NWARPS_Q3_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
+    const int nwarps = NWARPS_Q3_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+    const int nwarps = NWARPS_Q3_K_AMPERE;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+    const int nwarps = NWARPS_Q3_K_PASCAL;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q3_K_q8_1_mul_mat;
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q4_K_RDNA2  64
+#define  MMQ_Y_Q4_K_RDNA2  128
+#define NWARPS_Q4_K_RDNA2  8
+#define  MMQ_X_Q4_K_RDNA1  32
+#define  MMQ_Y_Q4_K_RDNA1  64
+#define NWARPS_Q4_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q4_K_AMPERE 4
+#define  MMQ_Y_Q4_K_AMPERE 32
+#define NWARPS_Q4_K_AMPERE 4
+#else
+#define  MMQ_X_Q4_K_AMPERE 64
+#define  MMQ_Y_Q4_K_AMPERE 128
+#define NWARPS_Q4_K_AMPERE 4
+#endif
+#define  MMQ_X_Q4_K_PASCAL 64
+#define  MMQ_Y_Q4_K_PASCAL 64
+#define NWARPS_Q4_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::stream &stream_ct1) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
+    const int nwarps = NWARPS_Q4_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
+    const int nwarps = NWARPS_Q4_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+    const int nwarps = NWARPS_Q4_K_AMPERE;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+    const int nwarps = NWARPS_Q4_K_PASCAL;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_K_q8_1_mul_mat;
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q5_K_RDNA2  64
+#define  MMQ_Y_Q5_K_RDNA2  128
+#define NWARPS_Q5_K_RDNA2  8
+#define  MMQ_X_Q5_K_RDNA1  32
+#define  MMQ_Y_Q5_K_RDNA1  64
+#define NWARPS_Q5_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q5_K_AMPERE 4
+#define  MMQ_Y_Q5_K_AMPERE 32
+#define NWARPS_Q5_K_AMPERE 4
+#else
+#define  MMQ_X_Q5_K_AMPERE 64
+#define  MMQ_Y_Q5_K_AMPERE 128
+#define NWARPS_Q5_K_AMPERE 4
+#endif
+#define  MMQ_X_Q5_K_PASCAL 64
+#define  MMQ_Y_Q5_K_PASCAL 64
+#define NWARPS_Q5_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::stream &stream_ct1) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
+    const int nwarps = NWARPS_Q5_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
+    const int nwarps = NWARPS_Q5_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+    const int nwarps = NWARPS_Q5_K_AMPERE;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+    const int nwarps = NWARPS_Q5_K_PASCAL;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_K_q8_1_mul_mat;
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q6_K_RDNA2  64
+#define  MMQ_Y_Q6_K_RDNA2  128
+#define NWARPS_Q6_K_RDNA2  8
+#define  MMQ_X_Q6_K_RDNA1  32
+#define  MMQ_Y_Q6_K_RDNA1  64
+#define NWARPS_Q6_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q6_K_AMPERE 4
+#define  MMQ_Y_Q6_K_AMPERE 32
+#define NWARPS_Q6_K_AMPERE 4
+#else
+#define  MMQ_X_Q6_K_AMPERE 64
+#define  MMQ_Y_Q6_K_AMPERE 64
+#define NWARPS_Q6_K_AMPERE 4
+#endif
+#define  MMQ_X_Q6_K_PASCAL 64
+#define  MMQ_Y_Q6_K_PASCAL 64
+#define NWARPS_Q6_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::stream &stream_ct1) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
+    const int nwarps = NWARPS_Q6_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
+    const int nwarps = NWARPS_Q6_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+    const int nwarps = NWARPS_Q6_K_AMPERE;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+    const int nwarps = NWARPS_Q6_K_PASCAL;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q6_K_q8_1_mul_mat;
+    bad_arch(stream_ct1);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
+static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
+                          const sycl::nd_item<3> &item_ct1,
+                          const sycl::stream &stream_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row * blocks_per_row + i +
+                        item_ct1.get_local_id(2) / (qi / vdr); // x block index
+
+        const int iby = (i + item_ct1.get_local_id(2) / (qi / vdr)) *
+                        (qk / QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs, stream_ct1);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:22: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
+                                   const sycl::nd_item<3> &item_ct1) {
+    // qk = quantized weights per x block
+    // qr = number of quantized weights per data value in x block
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int tid = item_ct1.get_local_id(2);
+
+    const int iter_stride = 2*GGML_CUDA_DMMV_X;
+    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+// partial sum for each thread
+#ifdef GGML_CUDA_F16
+    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#else
+    float tmp = 0.0f;
+#endif // GGML_CUDA_F16
+
+    for (int i = 0; i < ncols; i += iter_stride) {
+        const int col = i + vals_per_iter*tid;
+        const int ib = (row*ncols + col)/qk; // x block index
+        const int iqs = (col%qk)/qr; // x quant index
+        const int iybs = col - col%qk; // y block start index
+
+// processing >2 values per i iter is faster for fast GPUs
+#pragma unroll
+        for (int j = 0; j < vals_per_iter; j += 2) {
+            // process 2 vals per j iter
+
+            // dequantize
+            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel(vx, ib, iqs + j/qr, v);
+
+            // matrix multiplication
+            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_CUDA_F16
+            tmp += __hmul2(v, {
+                y[iybs + iqs + j/qr + 0],
+                y[iybs + iqs + j/qr + y_offset]
+            });
+#else
+            tmp += v.x() * y[iybs + iqs + j / qr + 0];
+            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
+#endif // GGML_CUDA_F16
+        }
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:23: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+#ifdef GGML_CUDA_F16
+        dst[row] = tmp.x + tmp.y;
+#else
+        dst[row] = tmp;
+#endif // GGML_CUDA_F16
+    }
+}
+
+static void mul_mat_p021_f16_f32(
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
+    const sycl::nd_item<3> &item_ct1) {
+
+    const sycl::half *x = (const sycl::half *)vx;
+
+    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                      item_ct1.get_local_id(1);
+    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                        item_ct1.get_local_id(0);
+    const int channel_x = channel / (nchannels_y / nchannels_x);
+
+    const int nrows_y = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst = row_x;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x;
+         col_x0 += item_ct1.get_local_range(2)) {
+        const int col_x = col_x0 + item_ct1.get_local_id(2);
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        // x is transposed and permuted
+        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
+        const float xi =
+            sycl::vec<sycl::half, 1>{x[ix]}
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const int row_y = col_x;
+
+
+        // y is not transposed but permuted
+        const int iy = channel*nrows_y + row_y;
+
+        tmp += xi * y[iy];
+    }
+
+    // dst is not transposed and not permuted
+    const int idst = channel*nrows_dst + row_dst;
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:24: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
+    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
+    const sycl::nd_item<3> &item_ct1) {
+
+    const sycl::half *x = (const sycl::half *)vx;
+
+    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                      item_ct1.get_local_id(1);
+    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                        item_ct1.get_local_id(0);
+    const int channel_x = channel / channel_x_divisor;
+
+    const int nrows_y   = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst   = row_x;
+
+    const int idst = channel*nrows_dst + row_dst;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x;
+         col_x0 += item_ct1.get_local_range(2)) {
+        const int col_x = col_x0 + item_ct1.get_local_id(2);
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        const int row_y = col_x;
+
+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
+        const int iy = channel*nrows_y + row_y;
+
+        const float xi =
+            sycl::vec<sycl::half, 1>{x[ix]}
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        tmp += xi * y[iy];
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:25: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    float * dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    sycl::half *dsti = (sycl::half *)cdsti;
+
+    *dsti = sycl::vec<float, 1>{(*xi)}
+                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+}
+
+static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const sycl::half *xi = (const sycl::half *)cxi;
+    sycl::half *dsti = (sycl::half *)cdsti;
+
+    *dsti = *xi;
+}
+
+template <cpy_kernel_t cpy_1>
+static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+                                   const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+                                   const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
+                                   const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= ne) {
+        return;
+    }
+
+    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+    // then combine those indices with the corresponding byte offsets to get the total offsets
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = i - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = i - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax = sycl::fmax(amax, sycl::fabs((float)v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j]*id;
+
+        dsti->qs[j] = sycl::round((float)x0);
+    }
+}
+
+static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < sycl::fabs((float)v)) {
+            amax = sycl::fabs((float)v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = xi[0       + j]*id;
+        const float x1 = xi[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x() = d;
+    dsti->dm.y() = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (xi[0       + j] - vmin)*id;
+        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static void cpy_f32_q(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+                                 const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
+                                 const sycl::nd_item<3> &item_ct1) {
+    const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2)) *
+                  qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
+    return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
+}
+
+struct rope_corr_dims {
+    float v[4];
+};
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale);
+    }
+    *cos_theta = sycl::cos(theta) * mscale;
+    *sin_theta = sycl::sin(theta) * mscale;
+}
+
+// rope == RoPE == rotary positional embedding
+template<typename T, bool has_pos>
+static void rope(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+,
+    const sycl::nd_item<3> &item_ct1) {
+    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                         item_ct1.get_local_id(1));
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;
+
+    const int p = has_pos ? pos[i2] : 0;
+    const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + 1];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + 1] = x0*sin_theta + x1*cos_theta;
+}
+
+template<typename T, bool has_pos>
+static void rope_neox(
+    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
+,
+    const sycl::nd_item<3> &item_ct1) {
+    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                         item_ct1.get_local_id(1));
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int ib = col / n_dims;
+    const int ic = col % n_dims;
+
+    if (ib > 0) {
+        const int i = row*ncols + ib*n_dims + ic;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i  = row*ncols + ib*n_dims + ic/2;
+    const int i2 = row/p_delta_rows;
+
+    float cur_rot = inv_ndims * ic - ib;
+
+    const int p = has_pos ? pos[i2] : 0;
+    const float theta_base =
+        p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims/2];
+
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+static void rope_glm_f32(
+    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    int n_ctx
+, const sycl::nd_item<3> &item_ct1) {
+    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;
+
+    const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols);
+     // FIXME: this is likely wrong
+    const int p = pos != nullptr ? pos[i2] : 0;
+
+    const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale;
+    const float sin_theta = sycl::sin((float)theta);
+    const float cos_theta = sycl::cos((float)theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta =
+        ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale;
+    const float sin_block_theta = sycl::sin((float)block_theta);
+    const float cos_block_theta = sycl::cos((float)block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
+static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+                                 const int n_heads_log2_floor, const float m0, const float m1,
+                                 const sycl::nd_item<3> &item_ct1) {
+    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i = row*ncols + col;
+
+    const int k = row/k_rows;
+
+    float m_k;
+    if (k < n_heads_log2_floor) {
+        m_k = dpct::pow(m0, k + 1);
+    } else {
+        m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
+    }
+
+    dst[i] = col * m_k + x[i];
+}
+
+static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(1);
+    const int col = item_ct1.get_local_id(2);
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum, item_ct1);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+template<typename T>
+static inline void swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
+                              const sycl::nd_item<3> &item_ct1) {
+    // bitonic sort
+    int col = item_ct1.get_local_id(2);
+    int row = item_ct1.get_group(1);
+
+    if (col >= ncols) return;
+
+    const float * x_row = x + row * ncols;
+    int * dst_row = dst + row * ncols;
+
+    // initialize indices
+    if (col < ncols) {
+        dst_row[col] = col;
+    }
+    /*
+    DPCT1065:73: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+
+    for (int k = 2; k <= ncols; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            /*
+            DPCT1118:26: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            /*
+            DPCT1065:74: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier();
+        }
+    }
+}
+
+static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
+                              const sycl::nd_item<3> &item_ct1) {
+    const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int i = row*ncols + col;
+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
+}
+
+static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
+                         const sycl::nd_item<3> &item_ct1, float *buf) {
+    const int tid = item_ct1.get_local_id(2);
+    const int rowx = item_ct1.get_group(2);
+    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+    const int block_size = item_ct1.get_local_range(2);
+
+    const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+    const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+
+    float max_val = -INFINITY;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
+    }
+
+    // find the max value in the block
+    max_val = warp_reduce_max(max_val, item_ct1);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = -INFINITY;
+        }
+        /*
+        DPCT1118:27: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:75: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        if (lane_id == 0) {
+            buf[warp_id] = max_val;
+        }
+        /*
+        DPCT1118:28: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:76: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        max_val = buf[lane_id];
+        max_val = warp_reduce_max(max_val, item_ct1);
+    }
+
+    float tmp = 0.f;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        const float val =
+            sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
+        tmp += val;
+        dst[ix] = val;
+    }
+
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = 0.f;
+        }
+        /*
+        DPCT1118:29: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:77: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        if (lane_id == 0) {
+            buf[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:30: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:78: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        tmp = buf[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    const float inv_tmp = 1.f / tmp;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = rowx*ncols + col;
+        dst[i] *= inv_tmp;
+    }
+}
+
+static void scale_f32(const float * x, float * dst, const float scale, const int k,
+                      const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = scale * x[i];
+}
+
+static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k,
+                      const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
+
+static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta,
+                           int IW, int IH, int OW, int KW, int KH,
+                           int pelements, int CHW, int s0, int s1, int p0,
+                           int p1, int d0, int d1,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_id(2) +
+                  item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (i >= pelements) {
+        return;
+    }
+
+    const int ksize = OW * (KH > 1 ? KW : 1);
+    const int kx = i / ksize;
+    const int kd = kx * ksize;
+    const int ky = (i - kd) / OW;
+    const int ix = i % OW;
+
+    const int64_t iiw = ix * s0 + kx * d0 - p0;
+    const int64_t iih = item_ct1.get_group(1) * s1 + ky * d1 - p1;
+
+    const int64_t offset_dst =
+        (item_ct1.get_group(1) * OW + ix) * CHW +
+        (item_ct1.get_group(0) * (KW * KH) + ky * KW + kx);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] =
+            sycl::vec<float, 1>{0.0f}
+                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+    } else {
+        const int64_t offset_src = item_ct1.get_group(0) * offset_delta;
+        dst[offset_dst] =
+            sycl::vec<float, 1>{x[offset_src + iih * IW + iiw]}
+                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+    }
+}
+
+template <int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const ggml_tensor *src0, const ggml_tensor *src1,
+                          ggml_tensor *dst, const void *src0_dd,
+                          const int32_t *src1_dd, float *dst_dd,
+                          dpct::queue_ptr stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             k_get_rows<qk, qr, dq>(
+                                 src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+                         });
+
+    (void) dst;
+}
+
+template <typename src0_t>
+static void get_rows_cuda_float(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const src0_t *src0_dd, const int32_t *src1_dd,
+                                float *dst_dd, dpct::queue_ptr stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+            });
+    }
+
+    (void) dst;
+}
+
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_cuda {
+    template <typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor *src0,
+                    const struct ggml_tensor *src1, struct ggml_tensor *dst,
+                    const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
+                    dpct::queue_ptr stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne0[] = {ne0, ne1, ne2, ne3};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+        size_t cnb0[] = {nb0, nb1, nb2, nb3};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        for (int i = 0; i < 4; i++) {
+            if (nr[i] != 1) {
+                break;
+            }
+            if (i > 0) {
+                collapse_nb(cnb0, cne0);
+                collapse_nb(cnb1, cne1);
+                collapse(cne0);
+                collapse(cne1);
+            }
+        }
+        {
+            int64_t ne0 = cne0[0];
+            int64_t ne1 = cne0[1];
+            int64_t ne2 = cne0[2];
+            int64_t ne3 = cne0[3];
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb0[0];
+            size_t nb1 = cnb0[1];
+            size_t nb2 = cnb0[2];
+            size_t nb3 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            sycl::range<3> block_dims(1, 1, 1);
+            block_dims[2] = std::min<unsigned int>(hne0, block_size);
+            block_dims[1] =
+                std::min<unsigned int>(ne1, block_size / block_dims[2]);
+            block_dims[0] = std::min(
+                std::min<unsigned int>(ne2 * ne3, block_size / block_dims[2] /
+                                                      block_dims[1]),
+                64U);
+
+            sycl::range<3> block_nums(
+                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
+                (ne1 + block_dims[1] - 1) / block_dims[1],
+                (hne0 + block_dims[2] - 1) / block_dims[2]);
+
+            if (block_nums[0] > 65535) {
+                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                {
+                    dpct::has_capability_or_fail(stream->get_device(),
+                                                 {sycl::aspect::fp16});
+                    stream->parallel_for(
+                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
+                                              sycl::range<3>(1, 1, block_size),
+                                          sycl::range<3>(1, 1, block_size)),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_bin_bcast_unravel<bin_op>(
+                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
+                                ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
+                                s13, item_ct1);
+                        });
+                }
+            } else {
+                /*
+                DPCT1049:31: The work-group size passed to the SYCL kernel may
+                exceed the limit. To get the device limit, query
+                info::device::max_work_group_size. Adjust the work-group size if
+                needed.
+                */
+                dpct::has_capability_or_fail(stream->get_device(),
+                                             {sycl::aspect::fp16});
+                stream->parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
+                                            ne2, ne3, ne10, ne11, ne12, ne13,
+                                            s1, s2, s3, s11, s12, s13,
+                                            item_ct1);
+                    });
+            }
+        }
+    }
+};
+
+static void acc_f32_cuda(const float *x, const float *y, float *dst,
+                         const int n_elements, const int ne10, const int ne11,
+                         const int ne12, const int nb1, const int nb2,
+                         const int offset, dpct::queue_ptr stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
+                    item_ct1);
+        });
+}
+
+static void gelu_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            gelu_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void silu_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            silu_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void gelu_quick_f32_cuda(const float *x, float *dst, const int k,
+                                dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            gelu_quick_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void tanh_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            tanh_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void relu_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            relu_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void leaky_relu_f32_cuda(const float *x, float *dst, const int k,
+                                const float negative_slope,
+                                dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
+        });
+}
+
+static void sqr_f32_cuda(const float *x, float *dst, const int k,
+                         dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            sqr_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void norm_f32_cuda(const float *x, float *dst, const int ncols,
+                          const int nrows, const float eps,
+                          dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                sycl::range<1>(32), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
+                                            s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    } else {
+        const sycl::range<3> block_dims(1, 1, 1024);
+        /*
+        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                sycl::range<1>(32), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        norm_f32<1024>(x, dst, ncols, eps, item_ct1,
+                                       s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    }
+}
+
+static void group_norm_f32_cuda(const float *x, float *dst,
+                                const int num_groups, const int group_size,
+                                const int ne_elements, dpct::queue_ptr stream) {
+    static const float eps = 1e-6f;
+    if (group_size < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            const float eps_ct4 = eps;
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        group_norm_f32<WARP_SIZE>(
+                            x, dst, group_size, ne_elements, eps_ct4, item_ct1,
+                            s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    } else {
+        const sycl::range<3> block_dims(1, 1, 1024);
+        /*
+        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            const float eps_ct4 = eps;
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        group_norm_f32<1024>(x, dst, group_size, ne_elements,
+                                             eps_ct4, item_ct1,
+                                             s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    }
+}
+
+static void concat_f32_cuda(const float *x, const float *y, float *dst,
+                            const int ne0, int ne1, int ne2, int ne02,
+                            dpct::queue_ptr stream) {
+    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    sycl::range<3> gridDim(ne2, ne1, num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            concat_f32(x, y, dst, ne0, ne02, item_ct1);
+        });
+}
+
+static void upscale_f32_cuda(const float *x, float *dst, const int ne00,
+                             const int ne01, const int ne02,
+                             const int scale_factor, dpct::queue_ptr stream) {
+    int ne0 = (ne00 * scale_factor);
+    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+    sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
+        });
+}
+
+static void pad_f32_cuda(const float *x, float *dst, const int ne00,
+                         const int ne01, const int ne02, const int ne0,
+                         const int ne1, const int ne2, dpct::queue_ptr stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    sycl::range<3> gridDim(ne2, ne1, num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
+        });
+}
+
+static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols,
+                              const int nrows, const float eps,
+                              dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        rms_norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
+                                                s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    } else {
+        const sycl::range<3> block_dims(1, 1, 1024);
+        /*
+        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        rms_norm_f32<1024>(x, dst, ncols, eps, item_ct1,
+                                           s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    }
+}
+
+static void quantize_row_q8_1_cuda(const float *x, void *vy, const int kx,
+                                   const int ky, const int kx_padded,
+                                   dpct::queue_ptr stream) {
+    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const sycl::range<3> num_blocks(1, ky, block_num_x);
+    const sycl::range<3> block_size(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(num_blocks * block_size, block_size),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                quantize_q8_1(x, vy, kx, kx_padded, item_ct1);
+            });
+    }
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cuda(const void *__restrict__ vx,
+                                  dst_t *__restrict__ y, const int k,
+                                  dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(
+                sycl::range<3>(1, 1, num_blocks) *
+                    sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE),
+                sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
+            });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q2_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q2_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q3_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q3_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q4_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_K(vx, y, item_ct1);
+                             });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q5_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q5_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q6_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q6_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F32:
+            return dequantize_block_cuda<1, 1, convert_f32>;
+        default:
+            return nullptr;
+    }
+}
+
+static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F16:
+            return dequantize_block_cuda<1, 1, convert_f16>;
+        default:
+            return nullptr;
+    }
+}
+
+static void dequantize_mul_mat_vec_q4_0_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q4_1_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q5_0_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q5_1_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q8_0_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q2_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q3_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q4_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q5_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const sycl::range<3> block_dims(1, 1, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q6_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void convert_mul_mat_vec_f16_cuda(const void *vx, const dfloat *y,
+                                         float *dst, const int ncols,
+                                         const int nrows,
+                                         dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
+                                                          nrows, item_ct1);
+            });
+    }
+}
+
+static void mul_mat_vec_q4_0_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
+                              vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                                 item_ct1, stream_ct1);
+            });
+    });
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_1 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
+                              vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
+                                                 item_ct1, stream_ct1);
+            });
+    });
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK5_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
+                              vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                                 item_ct1, stream_ct1);
+            });
+    });
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK5_1 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
+                              vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
+                                                 item_ct1, stream_ct1);
+            });
+    });
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK8_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
+                              vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                                 item_ct1, stream_ct1);
+            });
+    });
+}
+
+static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
+                              vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                                 item_ct1, stream_ct1);
+            });
+    });
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
+                              vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                                 item_ct1, stream_ct1);
+            });
+    });
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
+                              vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                                 item_ct1, stream_ct1);
+            });
+    });
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
+                              vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                                 item_ct1, stream_ct1);
+            });
+    });
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->submit([&](sycl::handler &cgh) {
+        sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
+                              vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                                 item_ct1, stream_ct1);
+            });
+    });
+}
+
+static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q4_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
+        nwarps = NWARPS_Q4_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q4_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
+        nwarps = NWARPS_Q4_0_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q4_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q4_1_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
+        nwarps = NWARPS_Q4_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q4_1_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
+        nwarps = NWARPS_Q4_1_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q4_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+        nwarps = NWARPS_Q4_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+        nwarps = NWARPS_Q4_1_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q5_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
+        nwarps = NWARPS_Q5_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q5_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
+        nwarps = NWARPS_Q5_0_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q5_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+        nwarps = NWARPS_Q5_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+        nwarps = NWARPS_Q5_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q5_1_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
+        nwarps = NWARPS_Q5_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q5_1_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
+        nwarps = NWARPS_Q5_1_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q5_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+        nwarps = NWARPS_Q5_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+        nwarps = NWARPS_Q5_1_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q8_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
+        nwarps = NWARPS_Q8_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q8_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
+        nwarps = NWARPS_Q8_0_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q8_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+        nwarps = NWARPS_Q8_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q8_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+        nwarps = NWARPS_Q8_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q2_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
+        nwarps = NWARPS_Q2_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q2_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
+        nwarps = NWARPS_Q2_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q2_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+        nwarps = NWARPS_Q2_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q2_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+        nwarps = NWARPS_Q2_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:46: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+#if QK_K == 256
+
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q3_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
+        nwarps = NWARPS_Q3_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q3_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
+        nwarps = NWARPS_Q3_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q3_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+        nwarps = NWARPS_Q3_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q3_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+        nwarps = NWARPS_Q3_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:48: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    }
+#endif
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q4_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
+        nwarps = NWARPS_Q4_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q4_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
+        nwarps = NWARPS_Q4_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q4_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:49: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:50: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q5_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
+        nwarps = NWARPS_Q5_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q5_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
+        nwarps = NWARPS_Q5_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q5_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:51: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:52: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q6_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
+        nwarps = NWARPS_Q6_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q6_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
+        nwarps = NWARPS_Q6_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q6_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q6_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:53: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:54: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                [=](sycl::nd_item<3> item_ct1) {
+                    mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
+                                             ncols_y, nrows_y, nrows_dst,
+                                             stream_ct1);
+                });
+        });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_p021_f16_f32_cuda(const void *vx, const float *y,
+                                           float *dst, const int ncols_x,
+                                           const int nrows_x,
+                                           const int nchannels_x,
+                                           const int nchannels_y,
+                                           dpct::queue_ptr stream) {
+
+    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
+                                     nchannels_y, item_ct1);
+            });
+    }
+}
+
+static void ggml_mul_mat_vec_nc_f16_f32_cuda(
+    const void *vx, const float *y, float *dst, const int ncols_x,
+    const int nrows_x, const int row_stride_x, const int nchannels_x,
+    const int nchannels_y, const int channel_stride_x, dpct::queue_ptr stream) {
+
+    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
+                                       row_stride_x, channel_stride_x,
+                                       nchannels_y / nchannels_x, item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_q8_0_cuda(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int nb00, const int nb01,
+                                   const int nb02, const int ne10,
+                                   const int ne11, const int nb10,
+                                   const int nb11, const int nb12,
+                                   dpct::queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
+                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
+                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
+                         });
+}
+
+static void ggml_cpy_f32_q4_0_cuda(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int nb00, const int nb01,
+                                   const int nb02, const int ne10,
+                                   const int ne11, const int nb10,
+                                   const int nb11, const int nb12,
+                                   dpct::queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK4_0 == 0);
+    const int num_blocks = ne / QK4_0;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
+                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
+                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
+                         });
+}
+
+static void ggml_cpy_f32_q4_1_cuda(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int nb00, const int nb01,
+                                   const int nb02, const int ne10,
+                                   const int ne11, const int nb10,
+                                   const int nb11, const int nb12,
+                                   dpct::queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK4_1 == 0);
+    const int num_blocks = ne / QK4_1;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
+                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
+                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
+                         });
+}
+
+static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void scale_f32_cuda(const float *x, float *dst, const float scale,
+                           const int k, dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            scale_f32(x, dst, scale, k, item_ct1);
+        });
+}
+
+static void clamp_f32_cuda(const float *x, float *dst, const float min,
+                           const float max, const int k,
+                           dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            clamp_f32(x, dst, min, max, k, item_ct1);
+        });
+}
+
+template <typename T>
+static void rope_cuda(const T *x, T *dst, int ncols, int nrows,
+                      const int32_t *pos, float freq_scale, int p_delta_rows,
+                      float freq_base, float ext_factor, float attn_factor,
+                      rope_corr_dims corr_dims, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % 2 == 0);
+    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
+    if (pos == nullptr) {
+        /*
+        DPCT1049:55: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows,
+                               freq_base, ext_factor, attn_factor, corr_dims,
+                               item_ct1);
+            });
+    } else {
+        /*
+        DPCT1049:56: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows,
+                              freq_base, ext_factor, attn_factor, corr_dims,
+                              item_ct1);
+            });
+    }
+}
+
+template <typename T>
+static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows,
+                           const int32_t *pos, float freq_scale,
+                           int p_delta_rows, float freq_base, float ext_factor,
+                           float attn_factor, rope_corr_dims corr_dims,
+                           dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % 2 == 0);
+    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.0f / n_dims;
+
+    if (pos == nullptr) {
+        /*
+        DPCT1049:57: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_neox<T, false>(x, dst, ncols, n_dims, pos, freq_scale,
+                                    p_delta_rows, ext_factor, attn_factor,
+                                    corr_dims, theta_scale, inv_ndims,
+                                    item_ct1);
+            });
+    } else {
+        /*
+        DPCT1049:58: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_neox<T, true>(x, dst, ncols, n_dims, pos, freq_scale,
+                                   p_delta_rows, ext_factor, attn_factor,
+                                   corr_dims, theta_scale, inv_ndims, item_ct1);
+            });
+    }
+}
+
+static void rope_glm_f32_cuda(const float *x, float *dst, int ncols, int nrows,
+                              const int32_t *pos, float freq_scale,
+                              int p_delta_rows, float freq_base, int n_ctx,
+                              dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const sycl::range<3> block_dims(1, 1, CUDA_ROPE_BLOCK_SIZE / 4);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
+    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             rope_glm_f32(x, dst, ncols, pos, freq_scale,
+                                          p_delta_rows, freq_base, n_ctx,
+                                          item_ct1);
+                         });
+}
+
+static void alibi_f32_cuda(const float *x, float *dst, const int ncols,
+                           const int nrows, const int k_rows,
+                           const int n_heads_log2_floor, const float m0,
+                           const float m1, dpct::queue_ptr stream) {
+    const sycl::range<3> block_dims(1, 1, CUDA_ALIBI_BLOCK_SIZE);
+    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             alibi_f32(x, dst, ncols, k_rows,
+                                       n_heads_log2_floor, m0, m1, item_ct1);
+                         });
+}
+
+static void sum_rows_f32_cuda(const float *x, float *dst, const int ncols,
+                              const int nrows, dpct::queue_ptr stream) {
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1)
+                             [[intel::reqd_sub_group_size(32)]] {
+                                 k_sum_rows_f32(x, dst, ncols, item_ct1);
+                             });
+}
+
+static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols,
+                                 const int nrows, ggml_sort_order order,
+                                 dpct::queue_ptr stream) {
+    // bitonic sort requires ncols to be power of 2
+    GGML_ASSERT((ncols & (ncols - 1)) == 0);
+
+    const sycl::range<3> block_dims(1, 1, ncols);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    if (order == GGML_SORT_ASC) {
+        /*
+        DPCT1049:59: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
+            });
+    } else if (order == GGML_SORT_DESC) {
+        /*
+        DPCT1049:60: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
+            });
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
+static void diag_mask_inf_f32_cuda(const float *x, float *dst,
+                                   const int ncols_x, const int nrows_x,
+                                   const int rows_per_channel, const int n_past,
+                                   dpct::queue_ptr stream) {
+    const sycl::range<3> block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
+    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+    const sycl::range<3> block_nums(1, block_num_x, nrows_x);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             diag_mask_inf_f32(x, dst, ncols_x,
+                                               rows_per_channel, n_past,
+                                               item_ct1);
+                         });
+}
+
+static void soft_max_f32_cuda(const float *x, const float *y, float *dst,
+                              const int ncols_x, const int nrows_x,
+                              const int nrows_y, const float scale,
+                              dpct::queue_ptr stream) {
+    int nth = WARP_SIZE;
+    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    const sycl::range<3> block_dims(1, 1, nth);
+    const sycl::range<3> block_nums(1, 1, nrows_x);
+    /*
+    DPCT1049:61: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
+    */
+    stream->submit([&](sycl::handler &cgh) {
+        /*
+        DPCT1101:111: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
+        replaced with a value. Modify the code to use the original expression,
+        provided in comments, if it is correct.
+        */
+        sycl::local_accessor<float, 1> buf_acc_ct1(
+            sycl::range<1>(32 /*CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
+                             buf_acc_ct1.get_pointer());
+            });
+    });
+}
+
+static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH,
+                                int OW, int OH, int KW, int KH, int IC,
+                                int offset_delta, int s0, int s1, int p0,
+                                int p1, int d0, int d1,
+                                dpct::queue_ptr stream) {
+    const int parallel_elements = OW * KW * KH;
+    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    sycl::range<3> block_nums(IC, OH, num_blocks);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums *
+                                  sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH,
+                               parallel_elements, (IC * KH * KW), s0, s1, p0,
+                               p1, d0, d1, item_ct1);
+            });
+    }
+}
+
+// buffer pool for cuda
+#define MAX_CUDA_BUFFERS 256
+
+struct scoped_spin_lock {
+    std::atomic_flag& lock;
+    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+        while (lock.test_and_set(std::memory_order_acquire)) {
+            ; // spin
+        }
+    }
+    ~scoped_spin_lock() {
+        lock.clear(std::memory_order_release);
+    }
+    scoped_spin_lock(const scoped_spin_lock&) = delete;
+    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+};
+
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+// #define DEBUG_CUDA_MALLOC
+struct cuda_buffer {
+    void * ptr = nullptr;
+    size_t size = 0;
+};
+
+static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
+static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
+
+static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+#ifdef DEBUG_CUDA_MALLOC
+    int nnz = 0;
+    size_t max_size = 0;
+#endif
+    size_t best_diff = 1ull << 36;
+    int ibest = -1;
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][i];
+        if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+            ++nnz;
+            if (b.size > max_size) max_size = b.size;
+#endif
+            if (b.size >= size) {
+                size_t diff = b.size - size;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    ibest = i;
+                    if (!best_diff) {
+                        void * ptr = b.ptr;
+                        *actual_size = b.size;
+                        b.ptr = nullptr;
+                        b.size = 0;
+                        return ptr;
+                    }
+                }
+            }
+        }
+    }
+    if (ibest >= 0) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        void * ptr = b.ptr;
+        *actual_size = b.size;
+        b.ptr = nullptr;
+        b.size = 0;
+        return ptr;
+    }
+    void * ptr;
+    size_t look_ahead_size = (size_t) (1.05 * size);
+    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(
+                             look_ahead_size, dpct::get_in_order_queue())));
+    *actual_size = look_ahead_size;
+    g_cuda_pool_size[id] += look_ahead_size;
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
+#endif
+    return ptr;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][i];
+        if (b.ptr == nullptr) {
+            b.ptr = ptr;
+            b.size = size;
+            return;
+        }
+    }
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+    g_cuda_pool_size[id] -= size;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+#if !defined(GGML_USE_HIPBLAS)
+// pool with virtual memory
+/*
+DPCT1082:79: Migration of CUmemGenericAllocationHandle type is not supported.
+*/
+// static std::vector<CUmemGenericAllocationHandle>
+//     g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
+static dpct::device_ptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
+static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
+static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
+
+static void *ggml_cuda_pool_malloc_vmm(size_t size, size_t *actual_size) try {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+
+    // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
+    const size_t alignment = 128;
+    size = alignment * ((size + alignment - 1) / alignment);
+
+    size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
+
+    if (size > avail) {
+        // round up to the next multiple of the granularity
+        size_t reserve_size = size - avail;
+        const size_t granularity = g_device_caps[id].vmm_granularity;
+        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
+
+        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
+
+        // allocate more physical memory
+        /*
+        DPCT1082:80: Migration of CUmemAllocationProp type is not supported.
+        */
+        CUmemAllocationProp prop = {};
+        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id = id;
+        /*
+        DPCT1082:81: Migration of CUmemGenericAllocationHandle type is not
+        supported.
+        */
+        // CUmemGenericAllocationHandle handle;
+        /*
+        DPCT1007:84: Migration of cuMemCreate is not supported.
+        */
+        // CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
+
+        // reserve virtual address space (if not already reserved)
+        if (g_cuda_pool_addr[id] == 0) {
+            /*
+            DPCT1007:85: Migration of cuMemAddressReserve is not supported.
+            */
+            // CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id],
+            //                              CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+        }
+
+        // map at the end of the pool
+        /*
+        DPCT1007:86: Migration of cuMemMap is not supported.
+        */
+        // CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id],
+        //                   reserve_size, 0, handle, 0));
+
+        // set access
+        /*
+        DPCT1082:87: Migration of CUmemAccessDesc type is not supported.
+        */
+        CUmemAccessDesc access = {};
+        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        access.location.id = id;
+        access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        /*
+        DPCT1007:88: Migration of cuMemSetAccess is not supported.
+        */
+        CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id],
+                                reserve_size, &access, 1));
+
+        // add to the pool
+        // g_cuda_pool_handles[id].push_back(handle);
+        g_cuda_pool_size[id] += reserve_size;
+
+        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
+        //       (unsigned long long) (reserve_size/1024/1024));
+    }
+
+    GGML_ASSERT(g_cuda_pool_addr[id] != 0);
+
+    void * ptr = (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]);
+    *actual_size = size;
+    g_cuda_pool_used[id] += size;
+
+#ifdef DEBUG_CUDA_MALLOC
+    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
+#endif
+
+    return ptr;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+
+#ifdef DEBUG_CUDA_MALLOC
+    printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
+#endif
+
+    g_cuda_pool_used[id] -= size;
+
+    // all deallocations must be in reverse order of the allocations
+    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try {
+    int id;
+
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    if (g_device_caps[id].vmm) {
+        return ggml_cuda_pool_malloc_vmm(size, actual_size);
+    } else {
+        return ggml_cuda_pool_malloc_leg(size, actual_size);
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_pool_free(void *ptr, size_t size) try {
+    int id;
+    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    if (g_device_caps[id].vmm) {
+        ggml_cuda_pool_free_vmm(ptr, size);
+    } else {
+        ggml_cuda_pool_free_leg(ptr, size);
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+#else
+#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
+#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
+#endif // !defined(GGML_USE_HIPBLAS)
+
+template<typename T>
+struct cuda_pool_alloc {
+    T * ptr = nullptr;
+    size_t actual_size = 0;
+
+    // size is in number of elements
+    T * alloc(size_t size) {
+        GGML_ASSERT(ptr == nullptr);
+        ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
+    cuda_pool_alloc(size_t size) {
+        alloc(size);
+    }
+
+    ~cuda_pool_alloc() {
+        if (ptr != nullptr) {
+            ggml_cuda_pool_free(ptr, actual_size);
+        }
+    }
+
+    T * get() {
+        return ptr;
+    }
+
+    cuda_pool_alloc() = default;
+    cuda_pool_alloc(const cuda_pool_alloc &) = delete;
+    cuda_pool_alloc(cuda_pool_alloc &&) = delete;
+    cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
+    cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
+};
+
+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+    return g_cublas_loaded;
+}
+
+void print_devices(int device_count){
+    for (int id = 0; id < device_count; ++id) {
+        dpct::device_info prop;
+        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(id))));
+
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id,
+                prop.get_name(), prop.get_major_version(),
+                prop.get_minor_version());
+    }
+}
+
+int get_env_value(const char *env_name, int default_val){
+    char * user_device_string = getenv(env_name);
+    int user_device_number = -1;
+
+    unsigned n;
+    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < g_device_count) {
+        user_device_number = (int)n;
+    } else {
+        user_device_number=default_val;
+    }
+}
+void ggml_init_cublas() try {
+    static bool initialized = false;
+
+    if (!initialized) {
+
+#ifdef __HIP_PLATFORM_AMD__
+        // Workaround for a rocBLAS bug when using multiple graphics cards:
+        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+        rocblas_initialize();
+        CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
+        g_device_count = dpct::dev_mgr::instance().device_count();
+        if (DPCT_CHECK_ERROR(g_device_count != 0)) {
+            initialized = true;
+            g_cublas_loaded = false;
+            return;
+        }
+
+        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
+        int64_t total_vram = 0;
+#if defined(GGML_CUDA_FORCE_MMQ)
+        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
+#else
+        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
+#endif
+#if defined(CUDA_USE_TENSOR_CORES)
+        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+#endif
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
+        print_devices(g_device_count);
+
+	//zjy hardcode, force set to 1 device
+        g_device_count = 1;
+
+        for (int id = 0; id < g_device_count; ++id) {
+            int device_vmm = 0;
+
+#if !defined(GGML_USE_HIPBLAS)
+            //int device;
+            //CU_CHECK(DPCT_CHECK_ERROR(device = id));
+            /*
+            DPCT1028:89: The cuDeviceGetAttribute was not migrated because
+            parameter CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED is
+            unsupported.
+            */
+            /*CU_CHECK(cuDeviceGetAttribute(
+                &device_vmm,
+                CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
+                device));
+            */
+            //if (device_vmm) {
+                /*
+                DPCT1082:90: Migration of CUmemAllocationProp type is not
+                supported.
+                */
+                //CUmemAllocationProp alloc_prop = {};
+                //alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+                //alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                //alloc_prop.location.id = id;
+                /*
+                DPCT1007:91: Migration of cuMemGetAllocationGranularity is not
+                supported.
+                */
+                //CU_CHECK(cuMemGetAllocationGranularity(
+                //    &g_device_caps[id].vmm_granularity, &alloc_prop,
+                //    CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+            //}
+#endif // !defined(GGML_USE_HIPBLAS)
+            g_device_caps[id].vmm = !!device_vmm;
+
+            dpct::device_info prop;
+            dpct::get_device_info(
+                prop, dpct::dev_mgr::instance().get_device(id))；
+
+            // CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+            //     prop, dpct::dev_mgr::instance().get_device(id))));
+            /*
+            DPCT1005:92: The SYCL device version is different from CUDA Compute
+            Compatibility. You may need to rewrite this code.
+            */
+            fprintf(stderr,
+                    "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id,
+                    prop.get_name(), prop.get_major_version(),
+                    prop.get_minor_version(), device_vmm ? "yes" : "no");
+
+            g_tensor_split[id] = total_vram;
+            total_vram += prop.get_global_mem_size();
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+#else
+            /*
+            DPCT1005:93: The SYCL device version is different from CUDA Compute
+            Compatibility. You may need to rewrite this code.
+            */
+            g_device_caps[id].cc =
+                100 * prop.get_major_version() + 10 * prop.get_minor_version();
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        }
+
+        int user_device_number = get_env_value("GGML_SYCL_DEVICE", 0);
+
+        for (int id = 0; id < g_device_count; ++id) {
+            g_tensor_split[id] /= total_vram;
+        }
+
+        for (int id = 0; id < g_device_count; ++id) {
+            ggml_cuda_set_device(id)；
+            // CUDA_CHECK(ggml_cuda_set_device(id));
+
+            // create cuda streams
+            for (int is = 0; is < MAX_STREAMS; ++is) {
+                /*
+                DPCT1025:105: The SYCL queue is created ignoring the flag and
+                priority options.
+                */
+                g_cudaStreams[id][is] =
+                        dpct::get_current_device().create_queue()；
+                // CUDA_CHECK(DPCT_CHECK_ERROR(
+                //     g_cudaStreams[id][is] =
+                //         dpct::get_current_device().create_queue()));
+            }
+
+            // create cublas handle
+            g_cublas_handles[id] = &dpct::get_in_order_queue();
+            // CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
+            //                                   &dpct::get_in_order_queue()));
+            /*
+            DPCT1027:107: The call to cublasSetMathMode was replaced with 0
+            because this call is redundant in SYCL.
+            */
+            CUBLAS_CHECK(0);
+        }
+
+        // configure logging to stdout
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+
+
+        ggml_cuda_set_device(user_device_number);
+        fprintf(stderr, "  set Device %d\n", user_device_number);
+
+        initialized = true;
+        g_cublas_loaded = true;
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
+    bool all_zero = true;
+    for (int i = 0; i < g_device_count; ++i) {
+        if (tensor_split[i] != 0.0f) {
+            all_zero = false;
+            break;
+        }
+    }
+    if (all_zero) {
+        return;
+    }
+    float split_sum = 0.0f;
+    for (int i = 0; i < g_device_count; ++i) {
+        g_tensor_split[i] = split_sum;
+        split_sum += tensor_split[i];
+    }
+    for (int i = 0; i < g_device_count; ++i) {
+        g_tensor_split[i] /= split_sum;
+    }
+}
+
+void *ggml_cuda_host_malloc(size_t size) try {
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    dpct::err0 err = DPCT_CHECK_ERROR(
+        ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
+    /*
+    DPCT1000:97: Error handling if-stmt was detected but could not be rewritten.
+    */
+    if (err != 0) {
+        // clear the error
+        /*
+        DPCT1026:98: The call to cudaGetLastError was removed because this call
+        is redundant in SYCL.
+        */
+        /*
+        DPCT1001:96: The statement could not be removed.
+        */
+        fprintf(
+            stderr,
+            "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            /*
+            DPCT1009:99: SYCL uses exceptions to report errors and does not use
+            the error codes. The original code was commented out and a warning
+            string was inserted. You need to rewrite this code.
+            */
+            size / 1024.0 / 1024.0,
+            "cudaGetErrorString is not supported" /*cudaGetErrorString(err)*/);
+        return nullptr;
+    }
+
+    return ptr;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_host_free(void *ptr) try {
+    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
+                                          const struct ggml_tensor *src,
+                                          int64_t i3, int64_t i2,
+                                          int64_t i1_low, int64_t i1_high,
+                                          dpct::queue_ptr stream) try {
+
+    dpct::memcpy_direction kind;
+    char * src_ptr;
+    if (src->backend == GGML_BACKEND_CPU) {
+        kind = dpct::host_to_device;
+        src_ptr = (char *) src->data;
+    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
+        kind = dpct::device_to_device;
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        int id;
+        CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+        src_ptr = (char *) extra->data_device[id];
+    } else {
+        GGML_ASSERT(false);
+    }
+    char * dst_ptr = (char *) dst;
+
+    const int64_t ne0 = src->ne[0];
+    const int64_t nb0 = src->nb[0];
+    const int64_t nb1 = src->nb[1];
+    const int64_t nb2 = src->nb[2];
+    const int64_t nb3 = src->nb[3];
+    const enum ggml_type type = src->type;
+    const int64_t ts = ggml_type_size(type);
+    const int64_t bs = ggml_blck_size(type);
+    int64_t i1_diff = i1_high - i1_low;
+
+    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == ts*ne0/bs) {
+        return DPCT_CHECK_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1));
+    } else if (nb0 == ts) {
+        return DPCT_CHECK_ERROR(
+            dpct::async_dpct_memcpy(dst_ptr, ts * ne0 / bs, x, nb1,
+                                    ts * ne0 / bs, i1_diff, kind, *stream));
+    } else {
+        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+            const void * rx = (const void *) ((const char *) x + i1*nb1);
+            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+            // pretend the row is a matrix with cols=1
+            dpct::err0 r = DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
+                rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream));
+            /*
+            DPCT1001:100: The statement could not be removed.
+            */
+            /*
+            DPCT1000:101: Error handling if-stmt was detected but could not be
+            rewritten.
+            */
+            if (r != 0) return r;
+        }
+        return 0;
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_op_get_rows(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_d, const float *src1_d,
+                                  float *dst_d, const dpct::queue_ptr &stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda_float(src0, src1, dst, (const sycl::half *)src0_d,
+                                src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        default:
+            // TODO: k-quants
+            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+            GGML_ASSERT(false);
+            break;
+    }
+}
+
+template <class op>
+inline void ggml_cuda_op_bin_bcast(const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd,
+                                   const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
+             (sycl::half *)dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
+             main_stream);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ASSERT(false);
+    }
+}
+
+static void ggml_cuda_op_repeat(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const float *src0_d, const float *src1_d,
+                                float *dst_d,
+                                const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
+
+    (void) src1;
+    (void) src1_d;
+}
+
+inline void ggml_cuda_op_add(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_cuda_op_acc(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
+
+    (void) dst;
+}
+
+inline void ggml_cuda_op_mul(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_cuda_op_div(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_cuda_op_gelu(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_silu(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_gelu_quick(const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_relu(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_leaky_relu(const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_norm(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_group_norm(const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int num_groups = dst->op_params[0];
+    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_concat(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const float *src0_dd, const float *src1_dd,
+                                float *dst_dd,
+                                const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
+    }
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_upscale(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const float *src0_dd, const float *src1_dd,
+                                 float *dst_dd,
+                                 const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    const int scale_factor = dst->op_params[0];
+
+    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_pad(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    pad_f32_cuda(src0_dd, dst_dd,
+        src0->ne[0], src0->ne[1], src0->ne[2],
+        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_rms_norm(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_mul_mat_q(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) try {
+
+    const int64_t ne00 = src0->ne[0];
+
+    const int64_t ne10 = src1->ne[0];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t row_diff = row_high - row_low;
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_ddf_i;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static int64_t get_row_rounding(ggml_type type) {
+    int64_t min_compute_capability = INT_MAX;
+    int64_t max_compute_capability = INT_MIN;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            if (min_compute_capability > g_device_caps[id].cc) {
+                min_compute_capability = g_device_caps[id].cc;
+            }
+            if (max_compute_capability < g_device_caps[id].cc) {
+                max_compute_capability = g_device_caps[id].cc;
+            }
+        }
+    }
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            return 1;
+        case GGML_TYPE_Q2_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
+        case GGML_TYPE_Q3_K:
+            return min_compute_capability < CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        default:
+            GGML_ASSERT(false);
+    }
+#else
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 64;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
+        case GGML_TYPE_Q6_K:
+            return 64;
+        default:
+            GGML_ASSERT(false);
+    }
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}
+
+inline void ggml_cuda_op_mul_mat_vec_q(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) {
+
+    GGML_ASSERT(ggml_nrows(src1) == 1);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_ddf_i;
+    (void) src1_ncols;
+    (void) src1_padded_row_size;
+}
+
+inline void ggml_cuda_op_dequantize_mul_mat_vec(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) {
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_F16
+    cuda_pool_alloc<half> src1_dfloat_a;
+    half * src1_dfloat = nullptr; // dfloat == half
+
+    bool src1_convert_f16 =
+        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+    if (src1_convert_f16) {
+        src1_dfloat = src1_dfloat_a.alloc(ne00);
+        ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                                ne00, 1, sizeof(float), 0, 0,
+                                ne00, 1, sizeof(half),  0, 0, stream);
+    }
+#else
+    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
+#endif // GGML_CUDA_F16
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_F16:
+            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_ddq_i;
+    (void) src1_ncols;
+    (void) src1_padded_row_size;
+}
+
+inline void ggml_cuda_op_mul_mat_cublas(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) try {
+
+    GGML_ASSERT(src0_dd_i  != nullptr);
+    GGML_ASSERT(src1_ddf_i != nullptr);
+    GGML_ASSERT(dst_dd_i   != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t row_diff = row_high - row_low;
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // ldc == nrows of the matrix that cuBLAS writes into
+    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+
+    const int compute_capability = g_device_caps[id].cc;
+
+    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+        cuda_pool_alloc<sycl::half> src0_as_f16;
+        if (src0->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = row_diff*ne00;
+            src0_as_f16.alloc(ne);
+            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
+        }
+        const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16
+                                         ? (const sycl::half *)src0_dd_i
+                                         : src0_as_f16.get();
+
+        cuda_pool_alloc<sycl::half> src1_as_f16;
+        if (src1->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = src1_ncols*ne10;
+            src1_as_f16.alloc(ne);
+            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
+        }
+        const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
+                                         ? (const sycl::half *)src1_ddf_i
+                                         : src1_as_f16.get();
+        cuda_pool_alloc<sycl::half> dst_f16(row_diff * src1_ncols);
+
+        const sycl::half alpha_f16 = 1.0f;
+        const sycl::half beta_f16 = 0.0f;
+
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm(
+            g_cublas_handles, oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
+            &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
+            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
+            dst_f16.get(), dpct::library_data_t::real_half, ldc,
+            dpct::library_data_t::real_half)));
+
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+    }
+    else {
+        cuda_pool_alloc<float> src0_ddq_as_f32;
+
+        if (src0->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src0_ddq_as_f32.alloc(row_diff*ne00);
+            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
+        }
+        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
+
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(
+            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, alpha,
+            src0_ddf_i, ne00, src1_ddf_i, ne10, beta, dst_dd_i, ldc)));
+    }
+
+    (void) dst;
+    (void) src1_ddq_i;
+    (void) src1_padded_row_size;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+inline void ggml_cuda_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t nrows = ggml_nrows(src0);
+
+    //const int n_past      = ((int32_t *) dst->op_params)[0];
+    const int n_dims      = ((int32_t *) dst->op_params)[1];
+    const int mode        = ((int32_t *) dst->op_params)[2];
+    const int n_ctx       = ((int32_t *) dst->op_params)[3];
+    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
+
+    // RoPE alteration for extended context
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+    const int32_t * pos = nullptr;
+    if ((mode & 1) == 0) {
+        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        GGML_ASSERT(src1->ne[0] == ne2);
+        pos = (const int32_t *) src1_dd;
+    }
+
+    const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
+
+    rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
+    // compute
+    if (is_glm) {
+        GGML_ASSERT(false);
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
+    } else if (is_neox) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_neox_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_neox_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
+                           ne00, n_dims, nrows, pos, freq_scale, ne01,
+                           freq_base, ext_factor, attn_factor, corr_dims,
+                           main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
+    } else {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
+                      nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                      attn_factor, corr_dims, main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t nrows = ggml_nrows(src0);
+
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_head = ((int32_t *) dst->op_params)[1];
+    float max_bias;
+    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+    //GGML_ASSERT(ne01 + n_past == ne00);
+    GGML_ASSERT(n_head == ne02);
+
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
+
+    (void) src1;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_im2col(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const float *src0_dd, const float *src1_dd,
+                                float *dst_dd,
+                                const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW =         src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW =         src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW =         dst->ne[1];
+
+    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+    im2col_f32_f16_cuda(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH,
+                        IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+
+    (void) src0;
+    (void) src0_dd;
+}
+
+inline void ggml_cuda_op_sum_rows(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_argsort(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const float *src0_dd, const float *src1_dd,
+                                 float *dst_dd,
+                                 const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_diag_mask_inf(const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst, const float *src0_dd,
+                                       const float *src1_dd, float *dst_dd,
+                                       const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int nrows0 = ggml_nrows(src0);
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+
+    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_soft_max(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
+
+    float scale = 1.0f;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
+
+    (void) dst;
+}
+
+inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float scale;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
+    /*
+    DPCT1010:102: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    CUDA_CHECK(0);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+
+    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    /*
+    DPCT1010:103: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    CUDA_CHECK(0);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_flatten(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const ggml_cuda_op_flatten_t op) try {
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const bool use_src1 = src1 != nullptr;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(              dst->backend != GGML_BACKEND_GPU_SPLIT);
+
+    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
+
+    const bool src0_on_device =             src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
+    const bool  dst_on_device =              dst->backend == GGML_BACKEND_GPU;
+
+    // dd = data device
+    float * src0_ddf = nullptr;
+    float * src1_ddf = nullptr;
+    float *  dst_ddf = nullptr;
+
+    cuda_pool_alloc<float> src0_f;
+    cuda_pool_alloc<float> src1_f;
+    cuda_pool_alloc<float>  dst_f;
+
+    ggml_cuda_set_device(g_main_device);
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    if (src0_on_device) {
+        src0_ddf = (float *) src0_extra->data_device[g_main_device];
+    } else {
+        src0_ddf = src0_f.alloc(ggml_nelements(src0));
+        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
+    }
+
+    if (use_src1) {
+        if (src1_on_device) {
+            src1_ddf = (float *) src1_extra->data_device[g_main_device];
+        } else {
+            src1_ddf = src1_f.alloc(ggml_nelements(src1));
+            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
+        }
+    }
+    if (dst_on_device) {
+        dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    } else {
+        dst_ddf = dst_f.alloc(ggml_nelements(dst));
+    }
+
+    // do the computation
+    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    /*
+    DPCT1010:104: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    CUDA_CHECK(0);
+
+    // copy dst to host if necessary
+    if (!dst_on_device) {
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            dpct::get_current_device().queues_wait_and_throw()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_set_peer_access(const int n_tokens) {
+    static bool peer_access_enabled = false;
+
+    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+    if (peer_access_enabled == enable_peer_access) {
+        return;
+    }
+
+#ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        for (int id_other = 0; id_other < g_device_count; ++id_other) {
+            if (id == id_other) {
+                continue;
+            }
+            if (id != g_main_device && id_other != g_main_device) {
+                continue;
+            }
+
+            int can_access_peer;
+            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            if (can_access_peer) {
+                if (enable_peer_access) {
+                    CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+                } else {
+                    CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+                }
+            }
+        }
+    }
+#endif // NDEBUG
+
+    peer_access_enabled = enable_peer_access;
+}
+
+static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 ggml_cuda_op_mul_mat_t op,
+                                 const bool convert_src1_to_q8_1) try {
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nrows1 = ggml_nrows(src1);
+
+    GGML_ASSERT(ne03 == ne13);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
+
+    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
+
+    const int64_t i02_divisor = ne12 / ne02;
+
+    const size_t src0_ts = ggml_type_size(src0->type);
+    const size_t src0_bs = ggml_blck_size(src0->type);
+    const size_t q8_1_ts = sizeof(block_q8_1);
+    const size_t q8_1_bs = QK8_1;
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
+
+    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src0_is_contiguous = ggml_is_contiguous(src0);
+    const bool src1_is_contiguous = ggml_is_contiguous(src1);
+
+    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
+
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 > 1));
+    GGML_ASSERT(!(split && ne03 > 1));
+    GGML_ASSERT(!(split && ne02 < ne12));
+
+    // dd = data device
+    char  *  src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
+    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
+    char  * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
+    float *   dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+    // as = actual size
+    size_t  src0_as[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t   dst_as[GGML_CUDA_MAX_DEVICES] = {0};
+
+    int64_t  row_low[GGML_CUDA_MAX_DEVICES];
+    int64_t row_high[GGML_CUDA_MAX_DEVICES];
+
+    int used_devices = 0;
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        // by default, use all rows
+        row_low[id]  = 0;
+        row_high[id] = ne01;
+
+        // for multi GPU, get the row boundaries from tensor split
+        // and round to mul_mat_q tile sizes
+        if (split) {
+            const int64_t rounding = get_row_rounding(src0->type);
+
+            if (id != 0) {
+                row_low[id]  = ne01*g_tensor_split[id];
+                if (row_low[id] < ne01) {
+                    row_low[id] -= row_low[id] % rounding;
+                }
+            }
+
+            if (id != g_device_count - 1) {
+                row_high[id]  = ne01*g_tensor_split[id + 1];
+                if (row_high[id] < ne01) {
+                    row_high[id] -= row_high[id] % rounding;
+                }
+            }
+        }
+    }
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
+
+        used_devices++;
+
+        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+        const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+
+        ggml_cuda_set_device(id);
+        const dpct::queue_ptr stream = g_cudaStreams[id][0];
+
+        if (src0_on_device && src0_is_contiguous) {
+            src0_dd[id] = (char *) src0_extra->data_device[id];
+        } else {
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
+        }
+
+        if (src1_on_device && src1_is_contiguous) {
+            src1_ddf[id] = (float *) src1_extra->data_device[id];
+        } else {
+            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
+        }
+
+        if (convert_src1_to_q8_1) {
+            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
+
+            if (src1_on_device && src1_is_contiguous) {
+                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
+                /*
+                DPCT1010:105: SYCL uses exceptions to report errors and does not
+                use the error codes. The call was replaced with 0. You need to
+                rewrite this code.
+                */
+                CUDA_CHECK(0);
+            }
+        }
+
+        if (dst_on_device) {
+            dst_dd[id] = (float *) dst_extra->data_device[id];
+        } else {
+            const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
+            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
+        }
+    }
+
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signals that the main device has finished calculating the input data
+    if (split && used_devices > 1) {
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        /*
+        DPCT1024:106: The original code returned the error code that was further
+        consumed by the program logic. This original code was replaced with 0.
+        You may need to rewrite the program logic consuming the error code.
+        */
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            *src0_extra->events[g_main_device][0] =
+                g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier()));
+    }
+
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
+        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
+        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
+
+        for (int64_t id = 0; id < g_device_count; ++id) {
+            if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+                continue;
+            }
+
+            const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+            const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+            const int64_t row_diff = row_high[id] - row_low[id];
+
+            ggml_cuda_set_device(id);
+            const dpct::queue_ptr stream = g_cudaStreams[id][is];
+
+            // wait for main GPU data if necessary
+            if (split && (id != g_main_device || is != 0)) {
+                CUDA_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier(
+                    {*src0_extra->events[g_main_device][0]})));
+            }
+
+            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
+                const int64_t i03 = i0 / ne12;
+                const int64_t i02 = i0 % ne12;
+
+                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
+
+                // for split tensors the data begins at i0 == i0_offset_low
+                char  *  src0_dd_i =  src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
+                float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
+                char  * src1_ddq_i = src1_ddq[id] +  src1_ddq_i_offset;
+                float *   dst_dd_i =   dst_dd[id] + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
+
+                // the main device memory buffer can be on VRAM scratch, with space for all partial results
+                // in that case an offset on dst_ddf_i is needed
+                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+                    dst_dd_i += row_low[id]; // offset is 0 if no tensor split
+                }
+
+                // copy src0, src1 to device if necessary
+                if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
+                    if (id != g_main_device) {
+                        if (convert_src1_to_q8_1) {
+                            char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
+                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                                src1_ddq_i, src1_ddq_i_source,
+                                src1_ncols * src1_padded_col_size * q8_1_ts /
+                                    q8_1_bs)));
+                        } else {
+                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
+                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
+                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                                src1_ddf_i, src1_ddf_i_source,
+                                src1_ncols * ne10 * sizeof(float))));
+                        }
+                    }
+                } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
+                                   src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
+                } else {
+                    GGML_ASSERT(false);
+                }
+
+                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
+                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+                    /*
+                    DPCT1010:107: SYCL uses exceptions to report errors and does
+                    not use the error codes. The call was replaced with 0. You
+                    need to rewrite this code.
+                    */
+                    CUDA_CHECK(0);
+                }
+
+                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
+                }
+
+                // do the computation
+                op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
+                   row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
+                /*
+                DPCT1010:108: SYCL uses exceptions to report errors and does not
+                use the error codes. The call was replaced with 0. You need to
+                rewrite this code.
+                */
+                CUDA_CHECK(0);
+
+                // copy dst to host or other device if necessary
+                if (!dst_on_device) {
+                    void * dst_off_device;
+                    dpct::memcpy_direction kind;
+                    if (dst->backend == GGML_BACKEND_CPU) {
+                        dst_off_device = dst->data;
+                        kind = dpct::device_to_host;
+                    } else if (dst->backend == GGML_BACKEND_GPU) {
+                        dst_off_device = dst_extra->data_device[g_main_device];
+                        kind = dpct::device_to_device;
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                    if (split) {
+                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
+                        // dst is NOT transposed.
+                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
+                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                        dhf_dst_i += src1_col_0*ne0 + row_low[id];
+                        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
+                            dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
+                            row_diff * sizeof(float), row_diff * sizeof(float),
+                            src1_ncols, kind, *stream)));
+                    } else {
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                        dhf_dst_i += src1_col_0*ne0;
+                        CUDA_CHECK(DPCT_CHECK_ERROR(
+                            stream->memcpy(dhf_dst_i, dst_dd_i,
+                                           src1_ncols * ne0 * sizeof(float))));
+                    }
+                }
+
+                // add event for the main device to wait on until other device is done
+                if (split && (id != g_main_device || is != 0)) {
+                    /*
+                    DPCT1024:109: The original code returned the error code that
+                    was further consumed by the program logic. This original
+                    code was replaced with 0. You may need to rewrite the
+                    program logic consuming the error code.
+                    */
+                    CUDA_CHECK(DPCT_CHECK_ERROR(
+                        *src0_extra->events[id][is] =
+                            stream->ext_oneapi_submit_barrier()));
+                }
+            }
+        }
+    }
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        // free buffers again when done
+        if (dst_as[id] > 0) {
+            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
+        }
+        if (src1_asq[id] > 0) {
+            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
+        }
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (src0_as[id] > 0) {
+            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
+        }
+    }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
+        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
+
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        for (int64_t id = 0; id < g_device_count; ++id) {
+            if (row_low[id] == row_high[id]) {
+                continue;
+            }
+            for (int64_t is = 0; is < is_max; ++is) {
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier(
+                        {*src0_extra->events[id][is]})));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            dpct::get_current_device().queues_wait_and_throw()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
+}
+
+static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
+}
+
+static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
+}
+
+static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
+}
+
+static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
+}
+
+static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
+}
+
+static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
+}
+
+static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
+}
+
+static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
+}
+
+static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
+}
+
+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
+static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
+}
+
+static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
+}
+
+static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
+}
+
+static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
+}
+
+static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
+}
+
+static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
+}
+
+bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_cublas_loaded) return false;
+
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    // TODO: find the optimal values for these
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+            src1->type == GGML_TYPE_F32 &&
+             dst->type == GGML_TYPE_F32 &&
+            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
+}
+
+static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst) try {
+    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
+    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t ne12 = src1->ne[2];
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0,
+                                     const ggml_tensor *src1,
+                                     ggml_tensor *dst) try {
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+    GGML_ASSERT(!ggml_is_permuted(src0));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    const int64_t ne12 = src1->ne[2];
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    const int64_t row_stride_x = nb01 / sizeof(sycl::half);
+    const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
+
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
+                                   const sycl::half *src1_as_f16, char *dst,
+                                   const void **ptrs_src, void **ptrs_dst,
+                                   int64_t ne12, int64_t ne13, int64_t ne23,
+                                   size_t nb02, size_t nb03, size_t nb12,
+                                   size_t nb13, size_t nbd2, size_t nbd3,
+                                   int64_t r2, int64_t r3,
+                                   const sycl::nd_item<3> &item_ct1) {
+    int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                  item_ct1.get_local_id(2);
+    int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
+                  item_ct1.get_local_id(1);
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int64_t i03 = i13 / r3;
+    int64_t i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2   + i13*nbd3;
+}
+
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
+                                                 const ggml_tensor *src1,
+                                                 ggml_tensor *dst) try {
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne  = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    CUBLAS_CHECK(
+        DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream));
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+    sycl::half *src0_as_f16 = (sycl::half *)src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    cuda_pool_alloc<sycl::half> src1_as_f16(ne1);
+    to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
+
+    cuda_pool_alloc<sycl::half> dst_f16;
+    char * dst_t;
+
+    dpct::library_data_t cu_compute_type = CUBLAS_COMPUTE_16F;
+    dpct::library_data_t cu_data_type = dpct::library_data_t::real_half;
+
+    // dst strides
+    size_t nbd2 = dst->nb[2];
+    size_t nbd3 = dst->nb[3];
+
+    const sycl::half alpha_f16 = 1.0f;
+    const sycl::half beta_f16 = 0.0f;
+
+    const float alpha_f32 = 1.0f;
+    const float beta_f32  = 0.0f;
+
+    const void * alpha = &alpha_f16;
+    const void * beta  = &beta_f16;
+
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        dst_t = (char *) dst_f16.alloc(ne);
+
+        nbd2 /= sizeof(float) / sizeof(sycl::half);
+        nbd3 /= sizeof(float) / sizeof(sycl::half);
+    } else {
+        dst_t = (char *) dst_ddf;
+
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+        cu_data_type = dpct::library_data_t::real_float;
+
+        alpha = &alpha_f32;
+        beta  = &beta_f32;
+    }
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+#if 0
+    // use cublasGemmEx
+    {
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;
+
+                CUBLAS_CHECK(
+                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
+                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
+                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
+                            cu_compute_type,
+                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+            }
+        }
+    }
+#else
+    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+        // use cublasGemmStridedBatchedEx
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
+            g_cublas_handles, oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
+            (const char *)src0_as_f16, dpct::library_data_t::real_half,
+            nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
+            (const char *)src1_as_f16.get(), dpct::library_data_t::real_half,
+            nb11 / sizeof(float), src1->nb[2] / sizeof(float), beta,
+            (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float),
+            ne12 * ne13, cu_compute_type)));
+    } else {
+        // use cublasGemmBatchedEx
+        const int ne23 = ne12*ne13;
+
+        cuda_pool_alloc<const void *> ptrs_src(2*ne23);
+        cuda_pool_alloc<      void *> ptrs_dst(1*ne23);
+
+        sycl::range<3> block_dims(1, ne12, ne13);
+        /*
+        DPCT1049:62: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(main_stream->get_device(),
+                                         {sycl::aspect::fp16});
+            main_stream->submit([&](sycl::handler &cgh) {
+                const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get();
+                const void **ptrs_src_get_ct3 = ptrs_src.get();
+                void **ptrs_dst_get_ct4 = ptrs_dst.get();
+
+                cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
+                                 [=](sycl::nd_item<3> item_ct1) {
+                                     k_compute_batched_ptrs(
+                                         src0_as_f16, src1_as_f16_get_ct1,
+                                         dst_t, ptrs_src_get_ct3,
+                                         ptrs_dst_get_ct4, ne12, ne13, ne23,
+                                         nb02, nb03, nb12, nb13, nbd2, nbd3, r2,
+                                         r3, item_ct1);
+                                 });
+            });
+        }
+        /*
+        DPCT1010:110: SYCL uses exceptions to report errors and does not use the
+        error codes. The call was replaced with 0. You need to rewrite this
+        code.
+        */
+        CUDA_CHECK(0);
+
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
+            g_cublas_handles, oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
+            (const void **)(ptrs_src.get() + 0 * ne23),
+            dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),
+            (const void **)(ptrs_src.get() + 1 * ne23),
+            dpct::library_data_t::real_half, nb11 / sizeof(float), beta,
+            (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
+            cu_compute_type)));
+    }
+#endif
+
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const bool all_on_device =
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+        (src1->backend == GGML_BACKEND_GPU) &&
+        ( dst->backend == GGML_BACKEND_GPU);
+
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
+    int64_t min_compute_capability = INT_MAX;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            min_compute_capability = g_device_caps[id].cc;
+        }
+    }
+
+#ifdef CUDA_USE_TENSOR_CORES
+    const bool use_tensor_cores = true;
+#else
+    const bool use_tensor_cores = false;
+#endif
+
+    // debug helpers
+    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
+    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+
+    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // KQ single-batch
+        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
+    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // KQV single-batch
+        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+        // KQ + KQV multi-batch
+        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
+    } else if (src0->type == GGML_TYPE_F32) {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+#ifdef GGML_CUDA_FORCE_DMMV
+            const bool use_mul_mat_vec_q = false;
+#else
+            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
+#endif // GGML_CUDA_FORCE_DMMV
+
+            if (use_mul_mat_vec_q) {
+                // NOTE: this kernel does not support ggml_nrows(src1) > 1
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+            } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+            }
+        } else {
+            bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+            // when tensor cores are available, use them for large batch size
+            // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+            if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+                use_mul_mat_q = false;
+            }
+
+            if (use_mul_mat_q) {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
+            } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+            }
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
+#if 0
+template<typename ... Srcs>
+static __global__ void k_compute_batched_ptrs_id(
+        const void ** ptrs_src, void ** ptrs_dst,
+        int ne12, int ne13,
+        int ne23,
+        int nb02, int nb03,
+        int nb12, int nb13,
+        int nb2, int nb3,
+        int r2, int r3,
+        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
+        const half * src1_f16, half * dst_f16,
+        const int32_t * ids, const int id,
+        Srcs... src0s) {
+
+    int i = ids[id];
+
+    half * src0_f16;
+    const void * srcs_ar[] = { (const half *) src0s... };
+    if (src0_type == GGML_TYPE_F16) {
+        src0_f16 = (half *) srcs_ar[i];
+    } else {
+        src0_f16 = src0_as_f16;
+        if (threadIdx.x == 0 && threadIdx.y == 0) {
+            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
+            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
+        }
+    }
+
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
+    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src00 = dst->src[2];
+
+    const int id = dst->op_params[0];
+
+    GGML_ASSERT(!ggml_is_transposed(src00));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+
+    GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src00->ne[1];
+    const int64_t ne02 = src00->ne[2];
+    const int64_t ne03 = src00->ne[3];
+
+    //const int64_t nb01 = src00->nb[1];
+    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    //const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne  = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
+
+    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    //void * src0_ddq = src0_extra->data_device[g_main_device];
+    //half * src0_as_f16 = (half *) src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16  = 0.0f;
+
+    // use cublasGemmBatchedEx
+    const int ne23 = ne12*ne13;
+
+    const void ** ptrs_src = nullptr;
+          void ** ptrs_dst = nullptr;
+
+    size_t ptrs_src_s = 0;
+    size_t ptrs_dst_s = 0;
+
+    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+
+    int64_t src0_ne = ggml_nelements(src00);
+    half * src0_as_f16 = nullptr;
+    size_t src0_as = 0;
+    if (src00->type != GGML_TYPE_F16) {
+        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
+    }
+
+    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
+    dim3 block_dims(ne13, ne12);
+    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
+            ptrs_src, ptrs_dst,
+            ne12, ne13,
+            ne23,
+            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
+            nb12, nb13,
+            dst->nb[2], dst->nb[3],
+            r2, r3,
+            src00->type, src0_as_f16, src0_ne,
+            src1_as_f16, dst_f16,
+            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
+            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    CUBLAS_CHECK(
+    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+            ne01, ne11, ne10,
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
+                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
+            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+            ne23,
+            CUBLAS_COMPUTE_16F,
+            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    if (src0_as != 0) {
+        ggml_cuda_pool_free(src0_as_f16, src0_as);
+    }
+    if (ptrs_src_s != 0) {
+        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+    }
+    if (ptrs_dst_s != 0) {
+        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+    }
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
+}
+#endif
+
+static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
+                                 const ggml_tensor *src1,
+                                 ggml_tensor *dst) try {
+#if 0
+    ggml_cuda_mul_mat_id_cublas(dst);
+    // TODO: mmq/mmv support
+#endif
+
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1  =  dst->nb[1];
+
+    const struct ggml_tensor * ids = src0;
+    const int32_t id = ((int32_t *) dst->op_params)[0];
+    const int32_t n_as = ((int32_t *) dst->op_params)[1];
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+
+    const dpct::queue_ptr stream = g_cudaStreams[g_main_device][0];
+
+    if (ids->backend == GGML_BACKEND_GPU) {
+        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
+        CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait()));
+    } else {
+        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
+    }
+
+    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
+    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
+
+    ggml_tensor_extra_gpu src1_row_extra;
+    ggml_tensor_extra_gpu dst_row_extra;
+
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row = *dst;
+
+    src1_row.backend = GGML_BACKEND_GPU;
+    dst_row.backend  = GGML_BACKEND_GPU;
+
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra = &dst_row_extra;
+
+    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+    char * dst_original  =  dst->backend == GGML_BACKEND_CPU ?
+        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];
+
+    if (src1->ne[1] == 1) {
+        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(dst->backend  == GGML_BACKEND_GPU);
+
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            //int32_t row_id;
+            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
+
+            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+        }
+    } else {
+        cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
+        cuda_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));
+
+        src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
+        dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();
+
+        const dpct::memcpy_direction src1_kind =
+            src1->backend == GGML_BACKEND_CPU ? dpct::host_to_device
+                                              : dpct::device_to_device;
+        const dpct::memcpy_direction dst_kind = dst->backend == GGML_BACKEND_CPU
+                                                    ? dpct::device_to_host
+                                                    : dpct::device_to_device;
+
+        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+            int64_t num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
+                                   src1_original + i01 * nb11, nb11)));
+                num_src1_rows++;
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+            src1_row.ne[1] = num_src1_rows;
+            dst_row.ne[1] = num_src1_rows;
+
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+            num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                    dst_original + i01 * nb1,
+                    dst_contiguous.get() + num_src1_rows * nb1, nb1)));
+                num_src1_rows++;
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
+}
+
+static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
+}
+
+static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
+                          ggml_tensor *dst) try {
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne == ggml_nelements(src1));
+
+    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+
+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    GGML_ASSERT(src0->ne[3] == 1);
+
+    const int64_t nb00 = src0->nb[0];
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(src1->ne[3] == 1);
+
+    const int64_t nb10 = src1->nb[0];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+
+    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
+
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else {
+        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ASSERT(false);
+    }
+
+    (void) dst;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    // TODO: why do we pass dst as src1 here?
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
+static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
+}
+
+static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
+}
+
+static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
+}
+
+static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
+}
+
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+}
+
+static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
+}
+
+static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
+}
+
+static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    (void) src0;
+    (void) src1;
+    (void) dst;
+}
+
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
+void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
+    const int64_t nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
+    const size_t nb1 = tensor->nb[1];
+
+    ggml_backend_type backend = tensor->backend;
+    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+    memset(extra, 0, sizeof(*extra));
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (backend == GGML_BACKEND_GPU && id != g_main_device) {
+            continue;
+        }
+
+        ggml_cuda_set_device(id);
+
+        int64_t row_low, row_high;
+        if (backend == GGML_BACKEND_GPU) {
+            row_low = 0;
+            row_high = nrows;
+        } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+            const int64_t rounding = get_row_rounding(tensor->type);
+
+            row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
+            row_low -= row_low % rounding;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows;
+            } else {
+                row_high = nrows*g_tensor_split[id + 1];
+                row_high -= row_high % rounding;
+            }
+        } else {
+            GGML_ASSERT(false);
+        }
+        if (row_low == row_high) {
+            continue;
+        }
+
+        int64_t nrows_split = row_high - row_low;
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        char * buf;
+        CUDA_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(
+                                        size, dpct::get_in_order_queue())));
+        char * buf_host = (char *)data + offset_split;
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(DPCT_CHECK_ERROR(
+                dpct::get_in_order_queue()
+                    .memset(buf + original_size, 0, size - original_size)
+                    .wait()));
+        }
+
+        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                                        .memcpy(buf, buf_host, original_size)
+                                        .wait()));
+
+        extra->data_device[id] = buf;
+
+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+                CUDA_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] =
+                                                new sycl::event()));
+            }
+        }
+    }
+
+    tensor->extra = extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_free_data(struct ggml_tensor *tensor) try {
+    if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(ggml_cuda_set_device(id));
+            CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(
+                extra->data_device[id], dpct::get_in_order_queue())));
+        }
+
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            if (extra->events[id][is] != nullptr) {
+                CUDA_CHECK(ggml_cuda_set_device(id));
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    dpct::destroy_event(extra->events[id][is])));
+            }
+        }
+    }
+
+    delete extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
+    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
+static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
+                                          bool scratch, bool force_inplace,
+                                          bool no_alloc) try {
+    if (scratch && g_scratch_size == 0) {
+        return;
+    }
+
+    tensor->backend = GGML_BACKEND_GPU;
+
+    // recursively assign CUDA buffers until a compute tensor is found
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src[0]->op;
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
+        }
+    }
+    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
+    }
+
+    if (scratch && no_alloc) {
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra;
+
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
+    const size_t size = ggml_nbytes(tensor);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&offset, tensor->op_params, sizeof(size_t));
+        }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra->data_device[g_main_device] = src0_ddc + offset;
+    } else if (tensor->op == GGML_OP_CPY) {
+        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
+        void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra->data_device[g_main_device] = src1_ddv;
+    } else if (scratch) {
+        GGML_ASSERT(size <= g_scratch_size);
+        if (g_scratch_offset + size > g_scratch_size) {
+            g_scratch_offset = 0;
+        }
+
+        char * data = (char *) g_scratch_buffer;
+        if (data == nullptr) {
+            CUDA_CHECK(DPCT_CHECK_ERROR(
+                data = (char *)sycl::malloc_device(
+                    g_scratch_size, dpct::get_in_order_queue())));
+            g_scratch_buffer = data;
+        }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra->data_device[g_main_device] = data + g_scratch_offset;
+
+        g_scratch_offset += size;
+
+        GGML_ASSERT(g_scratch_offset <= g_scratch_size);
+    } else { // allocate new buffers outside of scratch
+        void * data;
+        CUDA_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(
+                                        size, dpct::get_in_order_queue())));
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            dpct::get_in_order_queue().memset(data, 0, size).wait()));
+        extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
+        extra->data_device[g_main_device] = data;
+    }
+
+    tensor->extra = extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor,
+                                     size_t offset) try {
+    if (g_scratch_size == 0) {
+        return;
+    }
+    if (g_scratch_buffer == nullptr) {
+        ggml_cuda_set_device(g_main_device);
+        CUDA_CHECK(
+            DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
+                                 g_scratch_size, dpct::get_in_order_queue())));
+    }
+
+    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+    const bool inplace = tensor->view_src != nullptr;
+
+    if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t view_offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + view_offset;
+    } else {
+        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+    }
+
+    tensor->extra = extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_copy_to_device(struct ggml_tensor *tensor) try {
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(ggml_is_contiguous(tensor));
+
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                                    .memcpy(extra->data_device[g_main_device],
+                                            tensor->data, ggml_nbytes(tensor))
+                                    .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+}
+
+void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
+}
+
+void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
+}
+
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
+}
+
+void ggml_cuda_set_main_device(const int main_device) try {
+    if (main_device >= g_device_count) {
+        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
+                main_device, g_device_count, g_main_device);
+        return;
+    }
+
+    if (g_main_device != main_device && g_device_count > 1) {
+        g_main_device = main_device;
+        dpct::device_info prop;
+        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(g_main_device))));
+        fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__,
+                g_main_device, prop.get_name());
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+    // it still won't always work as expected, but it's better than nothing
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
+    g_scratch_size = std::max(g_scratch_size, scratch_size);
+}
+
+void ggml_cuda_free_scratch() try {
+    if (g_scratch_buffer == nullptr) {
+        return;
+    }
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(
+        sycl::free(g_scratch_buffer, dpct::get_in_order_queue())));
+    g_scratch_buffer = nullptr;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_cublas_loaded) return false;
+
+    ggml_cuda_func_t func;
+    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
+        return false;
+    }
+
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+            return false;
+        }
+    }
+
+    switch (tensor->op) {
+        case GGML_OP_REPEAT:
+            func = ggml_cuda_repeat;
+            break;
+        case GGML_OP_GET_ROWS:
+            func = ggml_cuda_get_rows;
+            break;
+        case GGML_OP_DUP:
+            func = ggml_cuda_dup;
+            break;
+        case GGML_OP_ADD:
+            func = ggml_cuda_add;
+            break;
+        case GGML_OP_ACC:
+            func = ggml_cuda_acc;
+            break;
+        case GGML_OP_MUL:
+            func = ggml_cuda_mul;
+            break;
+        case GGML_OP_DIV:
+            func = ggml_cuda_div;
+            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    func = ggml_cuda_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    func = ggml_cuda_silu;
+                    break;
+                case GGML_UNARY_OP_GELU_QUICK:
+                    func = ggml_cuda_gelu_quick;
+                    break;
+                case GGML_UNARY_OP_TANH:
+                    func = ggml_cuda_tanh;
+                    break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_cuda_relu;
+                    break;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_NORM:
+            func = ggml_cuda_norm;
+            break;
+        case GGML_OP_GROUP_NORM:
+            func = ggml_cuda_group_norm;
+            break;
+        case GGML_OP_CONCAT:
+            func = ggml_cuda_concat;
+            break;
+        case GGML_OP_UPSCALE:
+            func = ggml_cuda_upscale;
+            break;
+        case GGML_OP_PAD:
+            func = ggml_cuda_pad;
+            break;
+        case GGML_OP_LEAKY_RELU:
+            func = ggml_cuda_leaky_relu;
+            break;
+        case GGML_OP_RMS_NORM:
+            func = ggml_cuda_rms_norm;
+            break;
+        case GGML_OP_MUL_MAT:
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
+                return false;
+            }
+            func = ggml_cuda_mul_mat;
+            break;
+        case GGML_OP_MUL_MAT_ID:
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
+                return false;
+            }
+            func = ggml_cuda_mul_mat_id;
+            break;
+        case GGML_OP_SCALE:
+            func = ggml_cuda_scale;
+            break;
+        case GGML_OP_SQR:
+            func = ggml_cuda_sqr;
+            break;
+        case GGML_OP_CLAMP:
+            func = ggml_cuda_clamp;
+            break;
+        case GGML_OP_CPY:
+            func = ggml_cuda_cpy;
+            break;
+        case GGML_OP_CONT:
+            func = ggml_cuda_dup;
+            break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            func = ggml_cuda_nop;
+            break;
+        case GGML_OP_DIAG_MASK_INF:
+            func = ggml_cuda_diag_mask_inf;
+            break;
+        case GGML_OP_SOFT_MAX:
+            func = ggml_cuda_soft_max;
+            break;
+        case GGML_OP_ROPE:
+            func = ggml_cuda_rope;
+            break;
+        case GGML_OP_ALIBI:
+            func = ggml_cuda_alibi;
+            break;
+        case GGML_OP_IM2COL:
+            func = ggml_cuda_im2col;
+            break;
+        case GGML_OP_SUM_ROWS:
+            func = ggml_cuda_sum_rows;
+            break;
+        case GGML_OP_ARGSORT:
+            func = ggml_cuda_argsort;
+            break;
+        default:
+            return false;
+    }
+
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+    }
+
+    if (params->ith != 0) {
+        return true;
+    }
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return true;
+    }
+    func(tensor->src[0], tensor->src[1], tensor);
+    return true;
+}
+
+int ggml_cuda_get_device_count() try {
+    int device_count;
+    if (DPCT_CHECK_ERROR(device_count =
+                             dpct::dev_mgr::instance().device_count()) != 0) {
+        return 0;
+    }
+    return device_count;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_get_device_description(int device, char *description,
+                                      size_t description_size) try {
+    dpct::device_info prop;
+    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+        prop, dpct::dev_mgr::instance().get_device(device))));
+    snprintf(description, description_size, "%s", prop.get_name());
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#define UNUSED GGML_UNUSED
+
+// cuda buffer
+
+struct ggml_backend_buffer_context_cuda {
+    int device;
+    void * dev_ptr = nullptr;
+    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+    size_t temp_tensor_extra_index = 0;
+
+    ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+
+    ~ggml_backend_buffer_context_cuda() {
+        delete[] temp_tensor_extras;
+    }
+
+    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+        if (temp_tensor_extras == nullptr) {
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
+        }
+
+        size_t alloc_index = temp_tensor_extra_index;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
+        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+        memset(extra, 0, sizeof(*extra));
+
+        return extra;
+    }
+};
+
+static void
+ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
+    delete ctx;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    return ctx->dev_ptr;
+}
+
+static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer,
+                                                 ggml_tensor *tensor) try {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        assert(tensor->view_src->buffer->buft == buffer->buft);
+        tensor->backend = tensor->view_src->backend;
+        tensor->extra = tensor->view_src->extra;
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+    extra->data_device[ctx->device] = tensor->data;
+
+    tensor->backend = GGML_BACKEND_GPU;
+    tensor->extra = extra;
+
+    if (ggml_is_quantized(tensor->type)) {
+        // initialize padding to 0 to avoid possible NaN values
+        int64_t row_low = 0;
+        int64_t row_high = ggml_nrows(tensor);
+        int64_t nrows_split = row_high - row_low;
+
+        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset(
+                (char *)tensor->data + original_size, 0,
+                padded_size - original_size)));
+        }
+    }
+
+    UNUSED(buffer);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer,
+                                                ggml_tensor *tensor,
+                                                const void *data, size_t offset,
+                                                size_t size) try {
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
+
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                             .memcpy((char *)tensor->data + offset, data, size)
+                             .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer,
+                                                const ggml_tensor *tensor,
+                                                void *data, size_t offset,
+                                                size_t size) try {
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(
+        dpct::get_in_order_queue()
+            .memcpy(data, (const char *)tensor->data + offset, size)
+            .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer,
+                                           uint8_t value) try {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                                    .memset(ctx->dev_ptr, value, buffer->size)
+                                    .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor_from = */ NULL,
+    /* .cpy_tensor_to   = */ NULL,
+    /* .clear           = */ ggml_backend_cuda_buffer_clear,
+};
+
+// cuda buffer type
+
+static ggml_backend_buffer_t
+ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                           size_t size) try {
+    int device = (int) (intptr_t) buft->context;
+
+    ggml_cuda_set_device(device);
+
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+    void * dev_ptr;
+    CUDA_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(
+                                    size, dpct::get_in_order_queue())));
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+
+    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return size;
+
+    UNUSED(buft);
+}
+
+static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cuda(backend);
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+    /* .is_host          = */ nullptr,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+    static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+    if (!ggml_backend_cuda_buffer_type_initialized) {
+        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
+            ggml_backend_cuda_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
+                /* .context  = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+            };
+        }
+        ggml_backend_cuda_buffer_type_initialized = true;
+    }
+
+    return &ggml_backend_cuda_buffer_types[device];
+}
+
+// host buffer type
+
+static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_cuda_host_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr = ggml_cuda_host_malloc(size);
+
+    if (ptr == nullptr) {
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+        /* .iface    = */ {
+            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .context  = */ nullptr,
+    };
+
+    return &ggml_backend_cuda_buffer_type_host;
+}
+
+// backend
+
+struct ggml_backend_context_cuda {
+    int device;
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    delete cuda_ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
+                                               ggml_tensor *tensor,
+                                               const void *data, size_t offset,
+                                               size_t size) try {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+        (char *)tensor->data + offset, data, size)));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
+                                               const ggml_tensor *tensor,
+                                               void *data, size_t offset,
+                                               size_t size) try {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+        data, (const char *)tensor->data + offset, size)));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
+
+    UNUSED(backend);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    GGML_ASSERT(!"not implemented");
+
+    return nullptr;
+
+    UNUSED(backend);
+    UNUSED(cgraph);
+}
+
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    ggml_cuda_set_main_device(cuda_ctx->device);
+
+    ggml_compute_params params = {};
+    params.type = GGML_TASK_COMPUTE;
+    params.ith = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+            continue;
+
+        assert(node->backend == GGML_BACKEND_GPU);
+        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        assert(node->extra != nullptr);
+
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->extra != nullptr);
+            }
+        }
+
+        bool ok = ggml_cuda_compute_forward(&params, node);
+        if (!ok) {
+            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+
+#if 0
+        if (node->type == GGML_TYPE_F32) {
+            cudaDeviceSynchronize();
+            std::vector<float> tmp(ggml_nelements(node), 0.0f);
+            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+                ggml_type_name(node->src[0]->type),
+                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+                node->src[0]->name,
+                node->src[1] ? node->src[1]->name : "none");
+            double sum = 0.0;
+            double sq_sum = 0.0;
+            for (int i = 0; i < ggml_nelements(node); i++) {
+                printf("%f ", tmp[i]);
+                sum += tmp[i];
+                sq_sum += tmp[i]*tmp[i];
+            }
+            printf("\n");
+            printf("sum: %f, ", sum);
+            printf("sq_sum: %f\n", sq_sum);
+        }
+#endif
+    }
+
+    UNUSED(backend);
+}
+
+static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_TANH:
+                    return true;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
+                if (op->op == GGML_OP_MUL_MAT) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                return true;
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                return false;
+            } break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_NORM:
+        case GGML_OP_REPEAT:
+        case GGML_OP_DUP:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_CLAMP:
+        case GGML_OP_CONT:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ALIBI:
+        case GGML_OP_IM2COL:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_ACC:
+        case GGML_OP_CONCAT:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_LEAKY_RELU:
+            return true;
+        default:
+            return false;
+    }
+
+    UNUSED(backend);
+}
+
+static ggml_backend_i cuda_backend_i = {
+    /* .get_name                = */ ggml_backend_cuda_name,
+    /* .free                    = */ ggml_backend_cuda_free,
+    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
+    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
+    /* .cpy_tensor_from_async   = */ NULL,
+    /* .cpy_tensor_to_async     = */ NULL,
+    /* .synchronize             = */ ggml_backend_cuda_synchronize,
+    /* .graph_plan_create       = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute      = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op             = */ ggml_backend_cuda_supports_op,
+};
+
+ggml_backend_t ggml_backend_cuda_init(int device) {
+    ggml_init_cublas(); // TODO: remove from ggml.c
+
+    if (device < 0 || device >= ggml_cuda_get_device_count()) {
+        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        return nullptr;
+    }
+
+    // not strictly necessary, but it may reduce the overhead of the first graph_compute
+    ggml_cuda_set_main_device(device);
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
+        /* .device = */ device
+    };
+
+    ggml_backend_t cuda_backend = new ggml_backend {
+        /* .interface = */ cuda_backend_i,
+        /* .context   = */ ctx
+    };
+
+    return cuda_backend;
+}
+
+bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_cuda_name;
+}
+
+static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
+    return cuda_backend;
+
+    UNUSED(params);
+}
+
+extern "C" int ggml_backend_cuda_reg_devices();
+
+int ggml_backend_cuda_reg_devices() {
+    int device_count = ggml_cuda_get_device_count();
+    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
+    for (int i = 0; i < device_count; i++) {
+        char name[128];
+        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
+    }
+    return device_count;
+}
diff --git a/ggml-sycl.hpp b/ggml-sycl.hpp
new file mode 100644
index 0000000000000..40710da2e8bc8
--- /dev/null
+++ b/ggml-sycl.hpp
@@ -0,0 +1,4 @@
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+typedef half ggml_fp16_t;
+
diff --git a/ggml.h b/ggml.h
index dca7bd9ceb0d5..533f40c9f8f16 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2283,7 +2283,7 @@ extern "C" {
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
     typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
-    typedef struct {
+    typedef struct dpct_type_994041 {
         const char      * type_name;
         int               blck_size;
         size_t            type_size;

From 233876936b9f9671e57b6f5848d6ce9055caea08 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Thu, 28 Dec 2023 16:40:42 +0800
Subject: [PATCH 02/90] update init_cublas

---
 ggml-sycl.cpp | 2364 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 1456 insertions(+), 908 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 160cdf63a502f..e74902c98d5ce 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -117,7 +117,7 @@
 
 #include <dpct/lib_common_utils.hpp>
 
-#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define MIN_CC_DP4A   510 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA      700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2      (CC_OFFSET_AMD + 1030)
@@ -217,7 +217,7 @@ static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 #if DPCT_COMPAT_RT_VERSION >= 12000
     static const char *cublas_get_error_str(const int err) {
         /*
-        DPCT1009:63: SYCL uses exceptions to report errors and does not use the
+        DPCT1009:57: SYCL uses exceptions to report errors and does not use the
         error codes. The original code was commented out and a warning string
         was inserted. You need to rewrite this code.
         */
@@ -249,13 +249,13 @@ static void ggml_cuda_error(const char * stmt, const char * func, const char * f
 }
 
 /*
-DPCT1001:65: The statement could not be removed.
+DPCT1001:59: The statement could not be removed.
 */
 /*
-DPCT1000:66: Error handling if-stmt was detected but could not be rewritten.
+DPCT1000:60: Error handling if-stmt was detected but could not be rewritten.
 */
 /*
-DPCT1009:67: SYCL uses exceptions to report errors and does not use the error
+DPCT1009:61: SYCL uses exceptions to report errors and does not use the error
 codes. The original code was commented out and a warning string was inserted.
 You need to rewrite this code.
 */
@@ -273,16 +273,16 @@ You need to rewrite this code.
 static const char *cu_get_error_str(int err) {
     const char * err_str;
     /*
-    DPCT1007:64: Migration of cuGetErrorString is not supported.
+    DPCT1007:58: Migration of cuGetErrorString is not supported.
     */
     cuGetErrorString(err, &err_str);
     return err_str;
 }
 /*
-DPCT1001:82: The statement could not be removed.
+DPCT1001:76: The statement could not be removed.
 */
 /*
-DPCT1000:83: Error handling if-stmt was detected but could not be rewritten.
+DPCT1000:77: Error handling if-stmt was detected but could not be rewritten.
 */
 #define CU_CHECK(err)                                                          \
     do { auto err_ = (err);                                                    \
@@ -573,14 +573,15 @@ struct ggml_tensor_extra_gpu {
 // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
 inline dpct::err0 ggml_cuda_set_device(const int device) try {
     int current_device;
-    CUDA_CHECK(current_device = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(DPCT_CHECK_ERROR(
+        current_device = dpct::dev_mgr::instance().current_device_id()));
 
     if (device == current_device) {
         return 0;
     }
 
     /*
-    DPCT1093:68: The "device" device may be not the one intended for use. Adjust
+    DPCT1093:62: The "device" device may be not the one intended for use. Adjust
     the selected device if needed.
     */
     return DPCT_CHECK_ERROR(dpct::select_device(device));
@@ -624,13 +625,7 @@ static __dpct_inline__ float warp_reduce_sum(float x,
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         /*
-        DPCT1023:0: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        /*
-        DPCT1096:113: The right-most dimension of the work-group used in the
+        DPCT1096:107: The right-most dimension of the work-group used in the
         SYCL kernel that calls this function may be less than "32". The function
         "dpct::permute_sub_group_by_xor" may return an unexpected result on the
         CPU device. Modify the size of the work-group to ensure that the value
@@ -645,20 +640,8 @@ static __dpct_inline__ sycl::float2
 warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:1: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
                                                 mask);
-        /*
-        DPCT1023:2: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
                                                 mask);
     }
@@ -670,13 +653,7 @@ static __dpct_inline__ float warp_reduce_max(float x,
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         /*
-        DPCT1023:3: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        /*
-        DPCT1096:112: The right-most dimension of the work-group used in the
+        DPCT1096:106: The right-most dimension of the work-group used in the
         SYCL kernel that calls this function may be less than "32". The function
         "dpct::permute_sub_group_by_xor" may return an unexpected result on the
         CPU device. Modify the size of the work-group to ensure that the value
@@ -907,7 +884,7 @@ static void norm_f32(const float * x, float * dst, const int ncols, const float
             s_sum[warp_id] = mean_var;
         }
         /*
-        DPCT1118:4: SYCL group functions and algorithms must be encountered in
+        DPCT1118:0: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         item_ct1.barrier(sycl::access::fence_space::local_space);
@@ -1012,11 +989,11 @@ static void group_norm_f32(const float * x, float * dst, const int group_size, c
             s_sum[warp_id] = tmp;
         }
         /*
-        DPCT1118:5: SYCL group functions and algorithms must be encountered in
+        DPCT1118:1: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:69: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -1043,11 +1020,11 @@ static void group_norm_f32(const float * x, float * dst, const int group_size, c
             s_sum[warp_id] = tmp;
         }
         /*
-        DPCT1118:6: SYCL group functions and algorithms must be encountered in
+        DPCT1118:2: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:70: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:64: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -1087,7 +1064,7 @@ static void rms_norm_f32(const float * x, float * dst, const int ncols, const fl
             s_sum[warp_id] = tmp;
         }
         /*
-        DPCT1118:7: SYCL group functions and algorithms must be encountered in
+        DPCT1118:3: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         item_ct1.barrier(sycl::access::fence_space::local_space);
@@ -1127,8 +1104,8 @@ static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
                                             const int iqs, dfloat2 &v) {
     const block_q4_1 * x = (const block_q4_1 *) vx;
 
-    const dfloat d = x[ib].dm[1];
-    const dfloat m = x[ib].dm[0];
+    const dfloat d = x[ib].dm[0];
+    const dfloat m = x[ib].dm[1];
 
     const int vui = x[ib].qs[iqs];
 
@@ -1172,8 +1149,8 @@ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
                                             const int iqs, dfloat2 &v) {
     const block_q5_1 * x = (const block_q5_1 *) vx;
 
-    const dfloat d = x[ib].dm[1];
-    const dfloat m = x[ib].dm[0];
+    const dfloat d = x[ib].dm[0];
+    const dfloat m = x[ib].dm[1];
 
     uint32_t qh;
     memcpy(&qh, x[ib].qh, sizeof(qh));
@@ -1228,8 +1205,8 @@ static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restri
     const uint8_t q = x[i].qs[32*n + l];
     dst_t * y = yy + i*QK_K + 128*n;
 
-    float dall = x[i].dm[1];
-    float dmin = x[i].dm[0];
+    float dall = x[i].dm[0];
+    float dmin = x[i].dm[1];
     y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
     y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
     y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
@@ -1330,8 +1307,8 @@ static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restri
 
     dst_t * y = yy + i*QK_K + 64*il + n*ir;
 
-    const float dall = x[i].dm[1];
-    const float dmin = x[i].dm[0];
+    const float dall = x[i].dm[0];
+    const float dmin = x[i].dm[1];
 
     const uint8_t * q = x[i].qs + 32*il + n*ir;
 
@@ -1371,8 +1348,8 @@ static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restri
 
     dst_t * y = yy + i*QK_K + 64*il + 2*ir;
 
-    const float dall = x[i].dm[1];
-    const float dmin = x[i].dm[0];
+    const float dall = x[i].dm[0];
+    const float dmin = x[i].dm[1];
 
     const uint8_t * ql = x[i].qs + 32*il + 2*ir;
     const uint8_t * qh = x[i].qh + 2*ir;
@@ -1450,7 +1427,7 @@ static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restri
 }
 
 /*
-DPCT1110:8: The total declared local variable size in device function
+DPCT1110:4: The total declared local variable size in device function
 dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
 pressure. Consult with your hardware vendor to find the total register size
 available and adjust the code, or use smaller sub-group size to avoid high
@@ -1500,8 +1477,8 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
         const float   * y = yy + i * QK_K + y_offset;
         const uint8_t * q = x[i].qs + q_offset;
 
-        const float dall = x[i].dm[1];
-        const float dmin = x[i].dm[0];
+        const float dall = x[i].dm[0];
+        const float dmin = x[i].dm[1];
 
         const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
         aux[0] = a[0] & 0x0f0f0f0f;
@@ -1561,12 +1538,6 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:9: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -1577,7 +1548,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
 }
 
 /*
-DPCT1110:10: The total declared local variable size in device function
+DPCT1110:5: The total declared local variable size in device function
 dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register
 pressure. Consult with your hardware vendor to find the total register size
 available and adjust the code, or use smaller sub-group size to avoid high
@@ -1686,12 +1657,6 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:11: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -1702,7 +1667,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
 }
 
 /*
-DPCT1110:12: The total declared local variable size in device function
+DPCT1110:6: The total declared local variable size in device function
 dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
 pressure. Consult with your hardware vendor to find the total register size
 available and adjust the code, or use smaller sub-group size to avoid high
@@ -1763,8 +1728,8 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
         const float   * y1 = yy + i*QK_K + y_offset;
         const float   * y2 = y1 + 128;
 
-        const float dall = x[i].dm[1];
-        const float dmin = x[i].dm[0];
+        const float dall = x[i].dm[0];
+        const float dmin = x[i].dm[1];
 
         const uint16_t * a = (const uint16_t *)x[i].scales;
         aux[0] = a[im+0] & kmask1;
@@ -1845,12 +1810,6 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:13: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -1861,7 +1820,7 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 }
 
 /*
-DPCT1110:14: The total declared local variable size in device function
+DPCT1110:7: The total declared local variable size in device function
 dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register
 pressure. Consult with your hardware vendor to find the total register size
 available and adjust the code, or use smaller sub-group size to avoid high
@@ -1916,8 +1875,8 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
         const float   * y1  = yy + i*QK_K + y_offset;
         const float   * y2  = y1 + 128;
 
-        const float dall = x[i].dm[1];
-        const float dmin = x[i].dm[0];
+        const float dall = x[i].dm[0];
+        const float dmin = x[i].dm[1];
 
         const uint16_t * a = (const uint16_t *)x[i].scales;
         aux[0] = a[im+0] & kmask1;
@@ -1985,12 +1944,6 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:15: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -2106,12 +2059,6 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:16: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -2162,20 +2109,8 @@ static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy,
 
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:17: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
                                     item_ct1.get_sub_group(), amax, mask));
-        /*
-        DPCT1023:18: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         sum +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask);
     }
@@ -2300,9 +2235,9 @@ static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__
 #define VDR_Q4_0_Q8_1_MMQ  4
 
 template <int vdr>
-static __dpct_inline__ float
-vec_dot_q4_0_q8_1_impl(const int *v, const int *u, const float &d4,
-                       const sycl::half2 &ds8, const sycl::stream &stream_ct1) {
+static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
+                                                    const float &d4,
+                                                    const sycl::half2 &ds8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2314,16 +2249,17 @@ vec_dot_q4_0_q8_1_impl(const int *v, const int *u, const float &d4,
         const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
 
         // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
+        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
+        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
     }
 
-    const float2 ds8f = __half22float2(ds8);
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
 
     // second part effectively subtracts 8 from each quant value
-    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
+    return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y());
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2331,9 +2267,9 @@ vec_dot_q4_0_q8_1_impl(const int *v, const int *u, const float &d4,
 #define VDR_Q4_1_Q8_1_MMQ  4
 
 template <int vdr>
-static __dpct_inline__ float
-vec_dot_q4_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm4,
-                       const sycl::half2 &ds8, const sycl::stream &stream_ct1) {
+static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
+                                                    const sycl::half2 &dm4,
+                                                    const sycl::half2 &ds8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2345,8 +2281,8 @@ vec_dot_q4_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm4,
         const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
 
         // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
+        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
+        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
     }
 
 #ifdef GGML_CUDA_F16
@@ -2354,16 +2290,18 @@ vec_dot_q4_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm4,
     const float d4d8 = tmp.x;
     const float m4s8 = tmp.y;
 #else
-    const float2 dm4f = __half22float2(dm4);
-    const float2 ds8f = __half22float2(ds8);
-    const float d4d8 = dm4f.x * ds8f.x;
-    const float m4s8 = dm4f.y * ds8f.y;
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const float d4d8 = dm4f.x() * ds8f.x();
+    const float m4s8 = dm4f.y() * ds8f.y();
 #endif // GGML_CUDA_F16
 
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2373,8 +2311,7 @@ vec_dot_q4_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm4,
 template <int vdr>
 static __dpct_inline__ float
 vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
-                       const float &d5, const sycl::half2 &ds8,
-                       const sycl::stream &stream_ct1) {
+                       const float &d5, const sycl::half2 &ds8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2387,22 +2324,25 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
         vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
         vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
         vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+        sumi = dpct::dp4a(vi0, u[2 * i + 0],
+                          sumi); // SIMD dot product of quantized values
 
         int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
         vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
         vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
         vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
         vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+        sumi = dpct::dp4a(vi1, u[2 * i + 1],
+                          sumi); // SIMD dot product of quantized values
     }
 
-    const float2 ds8f = __half22float2(ds8);
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
 
     // second part effectively subtracts 16 from each quant value
-    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
+    return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y());
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2412,8 +2352,7 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
 template <int vdr>
 static __dpct_inline__ float
 vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
-                       const sycl::half2 &dm5, const sycl::half2 &ds8,
-                       const sycl::stream &stream_ct1) {
+                       const sycl::half2 &dm5, const sycl::half2 &ds8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2426,14 +2365,16 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
         vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
         vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
         vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+        sumi = dpct::dp4a(vi0, u[2 * i + 0],
+                          sumi); // SIMD dot product of quantized values
 
         int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
         vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
         vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
         vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
         vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+        sumi = dpct::dp4a(vi1, u[2 * i + 1],
+                          sumi); // SIMD dot product of quantized values
     }
 
 #ifdef GGML_CUDA_F16
@@ -2441,17 +2382,19 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
     const float d5d8 = tmp.x;
     const float m5s8 = tmp.y;
 #else
-    const float2 dm5f = __half22float2(dm5);
-    const float2 ds8f = __half22float2(ds8);
-    const float d5d8 = dm5f.x * ds8f.x;
-    const float m5s8 = dm5f.y * ds8f.y;
+    const sycl::float2 dm5f =
+        dm5.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const float d5d8 = dm5f.x() * ds8f.x();
+    const float m5s8 = dm5f.y() * ds8f.y();
 #endif // GGML_CUDA_F16
 
     // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
 
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2459,9 +2402,9 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
 #define VDR_Q8_0_Q8_1_MMQ 8
 
 template <int vdr>
-static __dpct_inline__ float
-vec_dot_q8_0_q8_1_impl(const int *v, const int *u, const float &d8_0,
-                       const float &d8_1, const sycl::stream &stream_ct1) {
+static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
+                                                    const float &d8_0,
+                                                    const float &d8_1) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2470,19 +2413,19 @@ vec_dot_q8_0_q8_1_impl(const int *v, const int *u, const float &d8_0,
 #pragma unroll
     for (int i = 0; i < vdr; ++i) {
         // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
+        sumi = dpct::dp4a(v[i], u[i], sumi);
     }
 
     return d8_0*d8_1 * sumi;
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template <int vdr>
-static __dpct_inline__ float
-vec_dot_q8_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm8,
-                       const sycl::half2 &ds8, const sycl::stream &stream_ct1) {
+static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
+                                                    const sycl::half2 &dm8,
+                                                    const sycl::half2 &ds8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2491,7 +2434,7 @@ vec_dot_q8_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm8,
 #pragma unroll
     for (int i = 0; i < vdr; ++i) {
         // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
+        sumi = dpct::dp4a(v[i], u[i], sumi);
     }
 
 #ifdef GGML_CUDA_F16
@@ -2499,16 +2442,18 @@ vec_dot_q8_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm8,
     const float d8d8 = tmp.x;
     const float m8s8 = tmp.y;
 #else
-    const float2 dm8f = __half22float2(dm8);
-    const float2 ds8f = __half22float2(ds8);
-    const float d8d8 = dm8f.x * ds8f.x;
-    const float m8s8 = dm8f.y * ds8f.y;
+    const sycl::float2 dm8f =
+        dm8.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const float d8d8 = dm8f.x() * ds8f.x();
+    const float m8s8 = dm8f.y() * ds8f.y();
 #endif // GGML_CUDA_F16
 
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2518,8 +2463,7 @@ vec_dot_q8_1_q8_1_impl(const int *v, const int *u, const sycl::half2 &dm8,
 // contiguous v/x values
 static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
     const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales,
-    const sycl::half2 &dm2, const float *__restrict__ d8,
-    const sycl::stream &stream_ct1) {
+    const sycl::half2 &dm2, const float *__restrict__ d8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2532,20 +2476,25 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 
         const int vi = (v >> (2*i)) & 0x03030303;
 
-        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+        sumf_d +=
+            d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
 
         // fill int with 4x m
         int m = sc >> 4;
         m |= m <<  8;
         m |= m << 16;
-        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
+        sumf_m += d8[i] *
+                  dpct::dp4a(
+                      m, u[i],
+                      0); // multiply constant q2_K part with sum of q8_1 values
     }
 
-    const float2 dm2f = __half22float2(dm2);
+    const sycl::float2 dm2f =
+        dm2.convert<float, sycl::rounding_mode::automatic>();
 
-    return dm2f.x*sumf_d - dm2f.y*sumf_m;
+    return dm2f.x() * sumf_d - dm2f.y() * sumf_m;
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2553,8 +2502,7 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
 static __dpct_inline__ float
 vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
                            const uint8_t *__restrict__ scales,
-                           const sycl::half2 &dm2, const float &d8,
-                           const sycl::stream &stream_ct1) {
+                           const sycl::half2 &dm2, const float &d8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2574,18 +2522,20 @@ vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
 
 #pragma unroll
         for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
-            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
+            sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
+            sumi_m = dpct::dp4a(m, u[i],
+                                sumi_m); // multiply sum of q8_1 values with m
         }
 
         sumi_d += sumi_d_sc * (sc & 0xF);
     }
 
-    const float2 dm2f = __half22float2(dm2);
+    const sycl::float2 dm2f =
+        dm2.convert<float, sycl::rounding_mode::automatic>();
 
-    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
+    return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m);
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2596,8 +2546,7 @@ vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
 static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
     const int &vl, const int &vh, const int *__restrict__ u,
     const uint8_t *__restrict__ scales, const int &scale_offset,
-    const float &d3, const float *__restrict__ d8,
-    const sycl::stream &stream_ct1) {
+    const float &d3, const float *__restrict__ d8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2621,14 +2570,15 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 
         const int vih = ((vh >> i) << 2) & 0x04040404;
 
-        const int vi = __vsubss4(vil, vih);
+        const int vi =
+            dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat());
 
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
     }
 
     return d3 * sumf;
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2636,7 +2586,7 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
 static __dpct_inline__ float
 vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
                            const int8_t *__restrict__ scales, const float &d3,
-                           const float &d8, const sycl::stream &stream_ct1) {
+                           const float &d8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2647,7 +2597,7 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
         int sumi_sc = 0;
 
         for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+            sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product
         }
 
         sumi += sumi_sc * scales[i0 / (QI8_1/2)];
@@ -2655,7 +2605,7 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
 
     return d3*d8 * sumi;
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2666,8 +2616,7 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
 static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     const int *__restrict__ v, const int *__restrict__ u,
     const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const float *__restrict__ d8,
-    const sycl::stream &stream_ct1) {
+    const sycl::half2 &dm4, const float *__restrict__ d8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2679,19 +2628,24 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
         const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
         const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
 
-        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
+        const int dot1 =
+            dpct::dp4a(v1i, u[2 * i + 1],
+                       dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product
+        const int dot2 =
+            dpct::dp4a(0x01010101, u[2 * i + 1],
+                       dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u
 
         sumf_d += d8[i] * (dot1 * sc[i]);
         sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
     }
 
-    const float2 dm4f = __half22float2(dm4);
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
 
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
 
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2699,8 +2653,7 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
 static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
     const int *__restrict__ v, const int *__restrict__ u,
     const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8,
-    const sycl::stream &stream_ct1) {
+    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2713,21 +2666,24 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
 
 #pragma unroll
         for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
+            sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F,
+                                u[i * QI8_1 + j], sumi_d); // SIMD dot product
         }
 
-        const float2 ds8f = __half22float2(ds8[i]);
+        const sycl::float2 ds8f =
+            ds8[i].convert<float, sycl::rounding_mode::automatic>();
 
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
+        sumf_d += ds8f.x() * (sc[i] * sumi_d);
+        sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val
     }
 
-    const float2 dm4f = __half22float2(dm4);
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
 
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
 
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2739,7 +2695,7 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     const int *__restrict__ vl, const int *__restrict__ vh,
     const int *__restrict__ u, const uint8_t *__restrict__ sc,
     const uint8_t *__restrict__ m, const sycl::half2 &dm5,
-    const float *__restrict__ d8, const sycl::stream &stream_ct1) {
+    const float *__restrict__ d8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2757,20 +2713,25 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
         const int v0i = vl0i | vh0i;
         const int v1i = vl1i | vh1i;
 
-        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
+        const int dot1 =
+            dpct::dp4a(v0i, u[2 * i + 0],
+                       dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product
+        const int dot2 =
+            dpct::dp4a(0x01010101, u[2 * i + 0],
+                       dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u
 
         sumf_d += d8[i] * (dot1 * sc[i]);
         sumf_m += d8[i] * (dot2 * m[i]);
 
     }
 
-    const float2 dm5f = __half22float2(dm5);
+    const sycl::float2 dm5f =
+        dm5.convert<float, sycl::rounding_mode::automatic>();
 
-    return dm5f.x*sumf_d - dm5f.y*sumf_m;
+    return dm5f.x() * sumf_d - dm5f.y() * sumf_m;
 
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2778,8 +2739,7 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
 static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
     const int *__restrict__ v, const int *__restrict__ u,
     const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8,
-    const sycl::stream &stream_ct1) {
+    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2792,21 +2752,24 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
 
 #pragma unroll
         for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+            sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j],
+                                sumi_d); // SIMD dot product
         }
 
-        const float2 ds8f = __half22float2(ds8[i]);
+        const sycl::float2 ds8f =
+            ds8[i].convert<float, sycl::rounding_mode::automatic>();
 
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
+        sumf_d += ds8f.x() * (sc[i] * sumi_d);
+        sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val
     }
 
-    const float2 dm4f = __half22float2(dm4);
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
 
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
 
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2814,10 +2777,11 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
 #define VDR_Q6_K_Q8_1_MMQ  8
 
 // contiguous v/x values
-static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(
-    const int &vl, const int &vh, const int *__restrict__ u,
-    const int8_t *__restrict__ scales, const float &d,
-    const float *__restrict__ d8, const sycl::stream &stream_ct1) {
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
+                            const int *__restrict__ u,
+                            const int8_t *__restrict__ scales, const float &d,
+                            const float *__restrict__ d8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2831,14 +2795,15 @@ static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 
         const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
 
-        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+        const int vi = dpct::vectorized_binary<sycl::char4>(
+            (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32
 
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
     }
 
     return d*sumf;
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
@@ -2846,8 +2811,7 @@ static __dpct_inline__ float vec_dot_q6_K_q8_1_impl_mmvq(
 static __dpct_inline__ float
 vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
                            const int8_t *__restrict__ sc, const float &d6,
-                           const float *__restrict__ d8,
-                           const sycl::stream &stream_ct1) {
+                           const float *__restrict__ d8) {
 
 #if DPCT_COMPATIBILITY_TEMP >=                                                 \
     MIN_CC_DP4A // lowest compute capability for integer intrinsics
@@ -2855,31 +2819,35 @@ vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
 
 #pragma unroll
     for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
-        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
+        sycl::int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
 
 #pragma unroll
         for (int i = i0; i < i0 + 2; ++i) {
-            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
-            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
-
-            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
-            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
+            sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0],
+                                    sumi_d.x()); // SIMD dot product
+            sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1],
+                                    sumi_d.x()); // SIMD dot product
+
+            sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4],
+                                    sumi_d.y()); // SIMD dot product
+            sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5],
+                                    sumi_d.y()); // SIMD dot product
         }
 
-        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
+        sumf_d += d8[i0 / 4] *
+                  (sc[i0 / 2 + 0] * sumi_d.x() + sc[i0 / 2 + 1] * sumi_d.y());
     }
 
     return d6 * sumf_d;
 
 #else
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __dpct_inline__ float
 vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
-                  const sycl::stream &stream_ct1) {
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
@@ -2893,8 +2861,7 @@ vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
         u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
     }
 
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds,
-                                                      stream_ct1);
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
 }
 
 template <int mmq_y>
@@ -2961,7 +2928,7 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat(
     const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
     const int *__restrict__ x_qh, const int *__restrict__ x_sc,
     const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    const int &i, const int &j, const int &k) {
     (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
@@ -2975,17 +2942,14 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat(
         u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
     }
 
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>(
-        &x_ql[i * (WARP_SIZE + 1) + k], u,
-        x_dmf[i * (WARP_SIZE / QI4_0) + i / QI4_0 + k / QI4_0],
-        y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)],
-        stream_ct1);
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
+         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }
 
 static __dpct_inline__ float
 vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
-                  const sycl::stream &stream_ct1) {
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
@@ -2999,8 +2963,7 @@ vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
         u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
     }
 
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm,
-                                                      bq8_1->ds, stream_ct1);
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
 }
 
 template <int mmq_y>
@@ -3065,7 +3028,7 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat(
     const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
     const int *__restrict__ x_qh, const int *__restrict__ x_sc,
     const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    const int &i, const int &j, const int &k) {
     (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
@@ -3078,17 +3041,14 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat(
         u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
     }
 
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>(
-        &x_ql[i * (WARP_SIZE + 1) + k], u,
-        x_dm[i * (WARP_SIZE / QI4_1) + i / QI4_1 + k / QI4_1],
-        y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)],
-        stream_ct1);
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
+         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }
 
 static __dpct_inline__ float
 vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
-                  const sycl::stream &stream_ct1) {
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
@@ -3104,8 +3064,7 @@ vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
         u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
     }
 
-    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d,
-                                                      bq8_1->ds, stream_ct1);
+    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
 }
 
 template <int mmq_y>
@@ -3192,7 +3151,7 @@ static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat(
     const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
     const int *__restrict__ x_qh, const int *__restrict__ x_sc,
     const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    const int &i, const int &j, const int &k) {
     (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
@@ -3208,16 +3167,13 @@ static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat(
         u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
     }
 
-    return vec_dot_q8_0_q8_1_impl<QR5_0 * VDR_Q5_0_Q8_1_MMQ>(
-        &x_ql[i * (2 * WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx],
-        y_df[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)],
-        stream_ct1);
+    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }
 
 static __dpct_inline__ float
 vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
-                  const sycl::stream &stream_ct1) {
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 
@@ -3233,8 +3189,7 @@ vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
         u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
     }
 
-    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm,
-                                                      bq8_1->ds, stream_ct1);
+    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
 }
 
 template <int mmq_y>
@@ -3316,7 +3271,7 @@ static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat(
     const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
     const int *__restrict__ x_qh, const int *__restrict__ x_sc,
     const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    const int &i, const int &j, const int &k) {
     (void)x_qh; (void)x_sc;
 
     const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
@@ -3330,16 +3285,13 @@ static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat(
         u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
     }
 
-    return vec_dot_q8_1_q8_1_impl<QR5_1 * VDR_Q5_1_Q8_1_MMQ>(
-        &x_ql[i * (2 * WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx],
-        y_ds[j * (WARP_SIZE / QI8_1) + (2 * k / QI8_1) % (WARP_SIZE / QI8_1)],
-        stream_ct1);
+    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
 }
 
 static __dpct_inline__ float
 vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
-                  const sycl::stream &stream_ct1) {
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 
@@ -3353,7 +3305,7 @@ vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
     }
 
     return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d,
-                                                      bq8_1->ds[1], stream_ct1);
+                                                      bq8_1->ds[0]);
 }
 
 template <int mmq_y>
@@ -3419,22 +3371,20 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat(
     const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
     const int *__restrict__ x_qh, const int *__restrict__ x_sc,
     const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    const int &i, const int &j, const int &k) {
     (void)x_qh; (void)x_sc;
 
     const float * x_dmf = (const float *) x_dm;
     const float * y_df  = (const float *) y_ds;
 
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>(
-        &x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k],
-        x_dmf[i * (WARP_SIZE / QI8_0) + i / QI8_0 + k / QI8_0],
-        y_df[j * (WARP_SIZE / QI8_1) + k / QI8_1], stream_ct1);
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
+         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
 }
 
 static __dpct_inline__ float
 vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
-                  const sycl::stream &stream_ct1) {
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
     const block_q2_K * bq2_K = (const block_q2_K *) vbq;
 
@@ -3450,10 +3400,10 @@ vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
 #pragma unroll
     for (int i = 0; i < QR2_K; ++ i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds[1];
+        d8[i] = bq8_1[bq8_offset + i].ds[0];
     }
 
-    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8, stream_ct1);
+    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
 }
 
 template <int mmq_y>
@@ -3532,7 +3482,7 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat(
     const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
     const int *__restrict__ x_qh, const int *__restrict__ x_sc,
     const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    const int &i, const int &j, const int &k) {
     (void)x_qh;
 
     const int kbx = k / QI2_K;
@@ -3552,16 +3502,12 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat(
     const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
 
     const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
-    return vec_dot_q2_K_q8_1_impl_mmq(
-        v, &y_qs[index_y], scales,
-        x_dm[i * (WARP_SIZE / QI2_K) + i / QI2_K + kbx], y_df[index_y / QI8_1],
-        stream_ct1);
+    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
 }
 
 static __dpct_inline__ float
 vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
-                  const sycl::stream &stream_ct1) {
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
     const block_q3_K * bq3_K = (const block_q3_K *) vbq;
 
@@ -3581,11 +3527,10 @@ vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
 #pragma unroll
     for (int i = 0; i < QR3_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds[1];
+        d8[i] = bq8_1[bq8_offset + i].ds[0];
     }
 
-    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset,
-                                       d, d8, stream_ct1);
+    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
 }
 
 template <int mmq_y>
@@ -3692,7 +3637,7 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat(
     const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
     const int *__restrict__ x_qh, const int *__restrict__ x_sc,
     const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    const int &i, const int &j, const int &k) {
 
     const int kbx  = k / QI3_K;
     const int ky  = (k % QI3_K) * QR3_K;
@@ -3716,16 +3661,12 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat(
     }
 
     const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
-    return vec_dot_q3_K_q8_1_impl_mmq(
-        v, &y_qs[index_y], scales,
-        x_dmf[i * (WARP_SIZE / QI3_K) + i / QI3_K + kbx], y_df[index_y / QI8_1],
-        stream_ct1);
+    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
 }
 
 static __dpct_inline__ float
 vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
-                  const sycl::stream &stream_ct1) {
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
 #ifndef GGML_QKK_64
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
@@ -3761,14 +3702,14 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
 
     for (int i = 0; i < QR4_K; ++i) {
         const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds[1];
+        d8[i] = bq8i->ds[0];
 
         const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
         u[2*i+0] = q8[0];
         u[2*i+1] = q8[4];
     }
 
-    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8, stream_ct1);
+    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
 
 #else
 
@@ -3905,22 +3846,19 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat(
     const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
     const int *__restrict__ x_qh, const int *__restrict__ x_sc,
     const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    const int &i, const int &j, const int &k) {
     (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
 
     const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k],
-                                      &y_qs[index_y], sc, sc + 8,
-                                      x_dm[i * (WARP_SIZE / QI4_K) + i / QI4_K],
-                                      &y_ds[index_y / QI8_1], stream_ct1);
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
 }
 
 static __dpct_inline__ float
 vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
-                  const sycl::stream &stream_ct1) {
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
 #ifndef GGML_QKK_64
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;
@@ -3963,8 +3901,7 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
         u[2*i+1] = q8[4];
     }
 
-    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8,
-                                       stream_ct1);
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
 
 #else
 
@@ -4108,23 +4045,20 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat(
     const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
     const int *__restrict__ x_qh, const int *__restrict__ x_sc,
     const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    const int &i, const int &j, const int &k) {
     (void)x_qh;
 
     const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
 
     const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
     const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
-    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc,
-                                      sc + 8,
-                                      x_dm[i * (WARP_SIZE / QI5_K) + i / QI5_K],
-                                      &y_ds[index_y / QI8_1], stream_ct1);
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
 }
 
 static __dpct_inline__ float
 vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs,
-                  const sycl::stream &stream_ct1) {
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
 
     const block_q6_K * bq6_K = (const block_q6_K *) vbq;
 
@@ -4143,11 +4077,10 @@ vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
 #pragma unroll
     for (int i = 0; i < QR6_K; ++i) {
         u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2 * i].ds[1];
+        d8[i] = bq8_1[bq8_offset + 2 * i].ds[0];
     }
 
-    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8,
-                                       stream_ct1);
+    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
 }
 
 template <int mmq_y>
@@ -4244,7 +4177,7 @@ static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat(
     const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
     const int *__restrict__ x_qh, const int *__restrict__ x_sc,
     const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k, const sycl::stream &stream_ct1) {
+    const int &i, const int &j, const int &k) {
     (void)x_qh;
 
     const float * x_dmf = (const float *) x_dm;
@@ -4254,10 +4187,7 @@ static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat(
 
     const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
     const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
-    return vec_dot_q6_K_q8_1_impl_mmq(
-        &x_ql[index_x], &y_qs[index_y], sc,
-        x_dmf[i * (WARP_SIZE / QI6_K) + i / QI6_K], &y_df[index_y / QI8_1],
-        stream_ct1);
+    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
 }
 
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
@@ -4265,7 +4195,7 @@ template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
           load_tiles_cuda_t load_tiles, int vdr,
           vec_dot_q_mul_mat_cuda_t vec_dot>
 /*
-DPCT1110:19: The total declared local variable size in device function mul_mat_q
+DPCT1110:8: The total declared local variable size in device function mul_mat_q
 exceeds 128 bytes and may cause high register pressure. Consult with your
 hardware vendor to find the total register size available and adjust the code,
 or use smaller sub-group size to avoid high register pressure.
@@ -4274,7 +4204,8 @@ static __dpct_inline__ void
 mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
           float *__restrict__ dst, const int ncols_x, const int nrows_x,
           const int ncols_y, const int nrows_y, const int nrows_dst,
-          const sycl::nd_item<3> &item_ct1, int *tile_y_qs,
+          const sycl::nd_item<3> &item_ct1, int *tile_x_ql,
+          sycl::half2 *tile_x_dm, int *tile_x_sc, int *tile_y_qs,
           sycl::half2 *tile_y_ds) {
 
     const block_q_t  * x = (const block_q_t  *) vx;
@@ -4297,7 +4228,53 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
     int   * tile_x_qh = nullptr;
     int   * tile_x_sc = nullptr;
 
-    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    /*
+    DPCT1084:11: The function call "allocate_tiles_q4_0" has multiple migration
+    results in different template instantiations that could not be unified. You
+    may need to adjust the code.
+    */
+    /*
+    DPCT1084:12: The function call "allocate_tiles_q4_1" has multiple migration
+    results in different template instantiations that could not be unified. You
+    may need to adjust the code.
+    */
+    /*
+    DPCT1084:13: The function call "allocate_tiles_q5_0" has multiple migration
+    results in different template instantiations that could not be unified. You
+    may need to adjust the code.
+    */
+    /*
+    DPCT1084:14: The function call "allocate_tiles_q5_1" has multiple migration
+    results in different template instantiations that could not be unified. You
+    may need to adjust the code.
+    */
+    /*
+    DPCT1084:15: The function call "allocate_tiles_q8_0" has multiple migration
+    results in different template instantiations that could not be unified. You
+    may need to adjust the code.
+    */
+    /*
+    DPCT1084:16: The function call "allocate_tiles_q2_K" has multiple migration
+    results in different template instantiations that could not be unified. You
+    may need to adjust the code.
+    */
+    /*
+    DPCT1084:17: The function call "allocate_tiles_q3_K" has multiple migration
+    results in different template instantiations that could not be unified. You
+    may need to adjust the code.
+    */
+    /*
+    DPCT1084:18: The function call "allocate_tiles_q4_K" has multiple migration
+    results in different template instantiations that could not be unified. You
+    may need to adjust the code.
+    */
+    /*
+    DPCT1084:19: The function call "allocate_tiles_q5_K" has multiple migration
+    results in different template instantiations that could not be unified. You
+    may need to adjust the code.
+    */
+    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, tile_x_ql,
+                   tile_x_dm, tile_x_sc);
 
     float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
@@ -4347,16 +4324,16 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
                     *dsi_dst = *dsi_src;
                 } else {
                     float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = (*dsi_src)[1];
+                    *dfi_dst = (*dsi_src)[0];
                 }
             }
 
             /*
-            DPCT1118:20: SYCL group functions and algorithms must be encountered
+            DPCT1118:9: SYCL group functions and algorithms must be encountered
             in converged control flow. You may need to adjust the code.
             */
             /*
-            DPCT1065:71: Consider replacing sycl::nd_item::barrier() with
+            DPCT1065:65: Consider replacing sycl::nd_item::barrier() with
             sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
             better performance if there is no access to global memory.
             */
@@ -4377,11 +4354,11 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
             }
 
             /*
-            DPCT1118:21: SYCL group functions and algorithms must be encountered
+            DPCT1118:10: SYCL group functions and algorithms must be encountered
             in converged control flow. You may need to adjust the code.
             */
             /*
-            DPCT1065:72: Consider replacing sycl::nd_item::barrier() with
+            DPCT1065:66: Consider replacing sycl::nd_item::barrier() with
             sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
             better performance if there is no access to global memory.
             */
@@ -4438,7 +4415,8 @@ template <bool need_check> static void
     mul_mat_q4_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::stream &stream_ct1) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4460,9 +4438,12 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
     const int nwarps = NWARPS_Q4_0_AMPERE;
 
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
+              allocate_tiles_q4_0<mmq_y>,
+              load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ,
+              vec_dot_q4_0_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
+        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
@@ -4474,7 +4455,7 @@ template <bool need_check> static void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4503,12 +4484,13 @@ template <bool need_check> static void
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::stream &stream_ct1) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4530,9 +4512,12 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
     const int nwarps = NWARPS_Q4_1_AMPERE;
 
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
+              allocate_tiles_q4_1<mmq_y>,
+              load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ,
+              vec_dot_q4_1_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
+        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
@@ -4544,7 +4529,7 @@ template <bool need_check> static void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4576,7 +4561,8 @@ template <bool need_check> static void
     mul_mat_q5_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::stream &stream_ct1) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4598,9 +4584,12 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
     const int nwarps = NWARPS_Q5_0_AMPERE;
 
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
+              allocate_tiles_q5_0<mmq_y>,
+              load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ,
+              vec_dot_q5_0_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
+        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
@@ -4612,7 +4601,7 @@ template <bool need_check> static void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4644,7 +4633,8 @@ template <bool need_check> static void
 mul_mat_q5_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::stream &stream_ct1) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4666,9 +4656,12 @@ mul_mat_q5_1(
     const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
     const int nwarps = NWARPS_Q5_1_AMPERE;
 
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
+              allocate_tiles_q5_1<mmq_y>,
+              load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ,
+              vec_dot_q5_1_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
+        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
@@ -4680,7 +4673,7 @@ mul_mat_q5_1(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4712,7 +4705,8 @@ template <bool need_check> static void
     mul_mat_q8_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::stream &stream_ct1) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4734,9 +4728,12 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
     const int nwarps = NWARPS_Q8_0_AMPERE;
 
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
+              allocate_tiles_q8_0<mmq_y>,
+              load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ,
+              vec_dot_q8_0_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
+        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
@@ -4748,7 +4745,7 @@ template <bool need_check> static void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4780,7 +4777,8 @@ template <bool need_check> static void
 mul_mat_q2_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::stream &stream_ct1) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4802,9 +4800,12 @@ mul_mat_q2_K(
     const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
     const int nwarps = NWARPS_Q2_K_AMPERE;
 
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
+              allocate_tiles_q2_K<mmq_y>,
+              load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ,
+              vec_dot_q2_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
+        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
@@ -4816,7 +4817,7 @@ mul_mat_q2_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4845,12 +4846,13 @@ template <bool need_check> static void
     __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::stream &stream_ct1) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4872,9 +4874,12 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
     const int nwarps = NWARPS_Q3_K_AMPERE;
 
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
+              allocate_tiles_q3_K<mmq_y>,
+              load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ,
+              vec_dot_q3_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
+        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
@@ -4886,7 +4891,7 @@ template <bool need_check> static void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4915,12 +4920,13 @@ template <bool need_check> static void
     __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::stream &stream_ct1) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4942,9 +4948,12 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
     const int nwarps = NWARPS_Q4_K_AMPERE;
 
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
+              allocate_tiles_q4_K<mmq_y>,
+              load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ,
+              vec_dot_q4_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
+        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
@@ -4956,7 +4965,7 @@ template <bool need_check> static void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -4988,7 +4997,8 @@ template <bool need_check> static void
 mul_mat_q5_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::stream &stream_ct1) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -5010,9 +5020,12 @@ mul_mat_q5_K(
     const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
     const int nwarps = NWARPS_Q5_K_AMPERE;
 
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
+              allocate_tiles_q5_K<mmq_y>,
+              load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ,
+              vec_dot_q5_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
+        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
@@ -5024,7 +5037,7 @@ mul_mat_q5_K(
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
@@ -5053,12 +5066,13 @@ template <bool need_check> static void
     __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
 #endif // defined(RDNA3) || defined(RDNA2)
 #elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
 #endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q6_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::stream &stream_ct1) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -5080,9 +5094,12 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
     const int nwarps = NWARPS_Q6_K_AMPERE;
 
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
+              allocate_tiles_q6_K<mmq_y>,
+              load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ,
+              vec_dot_q6_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
+        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
@@ -5094,14 +5111,13 @@ template <bool need_check> static void
         (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
-    bad_arch(stream_ct1);
+    bad_arch();
 #endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
-                          const sycl::nd_item<3> &item_ct1,
-                          const sycl::stream &stream_ct1) {
+                          const sycl::nd_item<3> &item_ct1) {
     const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                     item_ct1.get_local_id(1);
 
@@ -5130,18 +5146,12 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
             (item_ct1.get_local_id(2) %
              (qi / vdr)); // x block quant index when casting the quants to int
 
-        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs, stream_ct1);
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
     }
 
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:22: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -5209,12 +5219,6 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:23: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -5258,7 +5262,7 @@ static void mul_mat_p021_f16_f32(
         // x is transposed and permuted
         const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
         const float xi =
-            sycl::vec<sycl::half, 1>{x[ix]}
+            sycl::vec<sycl::half, 1>(x[ix])
                 .convert<float, sycl::rounding_mode::automatic>()[0];
 
         const int row_y = col_x;
@@ -5276,12 +5280,6 @@ static void mul_mat_p021_f16_f32(
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:24: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -5326,7 +5324,7 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
         const int iy = channel*nrows_y + row_y;
 
         const float xi =
-            sycl::vec<sycl::half, 1>{x[ix]}
+            sycl::vec<sycl::half, 1>(x[ix])
                 .convert<float, sycl::rounding_mode::automatic>()[0];
 
         tmp += xi * y[iy];
@@ -5335,12 +5333,6 @@ static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
     // sum up partial sums and write back result
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:25: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -5361,7 +5353,7 @@ static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
     const float * xi = (const float *) cxi;
     sycl::half *dsti = (sycl::half *)cdsti;
 
-    *dsti = sycl::vec<float, 1>{(*xi)}
+    *dsti = sycl::vec<float, 1>(*xi)
                 .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
 }
 
@@ -5729,7 +5721,7 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
         dst_row[col] = col;
     }
     /*
-    DPCT1065:73: Consider replacing sycl::nd_item::barrier() with
+    DPCT1065:67: Consider replacing sycl::nd_item::barrier() with
     sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
     performance if there is no access to global memory.
     */
@@ -5750,11 +5742,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
                 }
             }
             /*
-            DPCT1118:26: SYCL group functions and algorithms must be encountered
+            DPCT1118:20: SYCL group functions and algorithms must be encountered
             in converged control flow. You may need to adjust the code.
             */
             /*
-            DPCT1065:74: Consider replacing sycl::nd_item::barrier() with
+            DPCT1065:68: Consider replacing sycl::nd_item::barrier() with
             sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
             better performance if there is no access to global memory.
             */
@@ -5806,11 +5798,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
             buf[lane_id] = -INFINITY;
         }
         /*
-        DPCT1118:27: SYCL group functions and algorithms must be encountered in
+        DPCT1118:21: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:75: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:69: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -5820,11 +5812,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
             buf[warp_id] = max_val;
         }
         /*
-        DPCT1118:28: SYCL group functions and algorithms must be encountered in
+        DPCT1118:22: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:76: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:70: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -5852,11 +5844,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
             buf[lane_id] = 0.f;
         }
         /*
-        DPCT1118:29: SYCL group functions and algorithms must be encountered in
+        DPCT1118:23: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:77: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:71: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -5866,11 +5858,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
             buf[warp_id] = tmp;
         }
         /*
-        DPCT1118:30: SYCL group functions and algorithms must be encountered in
+        DPCT1118:24: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:78: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:72: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -5938,12 +5930,12 @@ static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta,
 
     if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
         dst[offset_dst] =
-            sycl::vec<float, 1>{0.0f}
+            sycl::vec<float, 1>(0.0f)
                 .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
     } else {
         const int64_t offset_src = item_ct1.get_group(0) * offset_delta;
         dst[offset_dst] =
-            sycl::vec<float, 1>{x[offset_src + iih * IW + iiw]}
+            sycl::vec<float, 1>(x[offset_src + iih * IW + iiw])
                 .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
     }
 }
@@ -6009,6 +6001,7 @@ static void get_rows_cuda_float(const ggml_tensor *src0,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) {
@@ -6106,11 +6099,12 @@ struct bin_bcast_cuda {
 
             sycl::range<3> block_dims(1, 1, 1);
             block_dims[2] = std::min<unsigned int>(hne0, block_size);
-            block_dims[1] =
-                std::min<unsigned int>(ne1, block_size / block_dims[2]);
+            block_dims[1] = std::min<unsigned int>(
+                ne1, block_size / (unsigned int)block_dims[2]);
             block_dims[0] = std::min(
-                std::min<unsigned int>(ne2 * ne3, block_size / block_dims[2] /
-                                                      block_dims[1]),
+                std::min<unsigned int>(
+                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
+                                   (unsigned int)block_dims[1]),
                 64U);
 
             sycl::range<3> block_nums(
@@ -6124,6 +6118,7 @@ struct bin_bcast_cuda {
                 {
                     dpct::has_capability_or_fail(stream->get_device(),
                                                  {sycl::aspect::fp16});
+
                     stream->parallel_for(
                         sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
                                               sycl::range<3>(1, 1, block_size),
@@ -6137,13 +6132,14 @@ struct bin_bcast_cuda {
                 }
             } else {
                 /*
-                DPCT1049:31: The work-group size passed to the SYCL kernel may
+                DPCT1049:25: The work-group size passed to the SYCL kernel may
                 exceed the limit. To get the device limit, query
                 info::device::max_work_group_size. Adjust the work-group size if
                 needed.
                 */
                 dpct::has_capability_or_fail(stream->get_device(),
                                              {sycl::aspect::fp16});
+
                 stream->parallel_for(
                     sycl::nd_range<3>(block_nums * block_dims, block_dims),
                     [=](sycl::nd_item<3> item_ct1) {
@@ -6279,7 +6275,7 @@ static void norm_f32_cuda(const float *x, float *dst, const int ncols,
     } else {
         const sycl::range<3> block_dims(1, 1, 1024);
         /*
-        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:26: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -6324,7 +6320,7 @@ static void group_norm_f32_cuda(const float *x, float *dst,
     } else {
         const sycl::range<3> block_dims(1, 1, 1024);
         /*
-        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:27: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -6411,7 +6407,7 @@ static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols,
     } else {
         const sycl::range<3> block_dims(1, 1, 1024);
         /*
-        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:28: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -6440,6 +6436,7 @@ static void quantize_row_q8_1_cuda(const float *x, void *vy, const int kx,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(num_blocks * block_size, block_size),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6456,6 +6453,7 @@ static void dequantize_block_cuda(const void *__restrict__ vx,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(
                 sycl::range<3>(1, 1, num_blocks) *
@@ -6475,6 +6473,7 @@ static void dequantize_row_q2_K_cuda(const void *vx, dst_t *y, const int k,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 64),
                                                sycl::range<3>(1, 1, 64)),
@@ -6495,6 +6494,7 @@ static void dequantize_row_q3_K_cuda(const void *vx, dst_t *y, const int k,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 64),
                                                sycl::range<3>(1, 1, 64)),
@@ -6514,6 +6514,7 @@ static void dequantize_row_q4_K_cuda(const void *vx, dst_t *y, const int k,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 32),
                                                sycl::range<3>(1, 1, 32)),
@@ -6531,6 +6532,7 @@ static void dequantize_row_q5_K_cuda(const void *vx, dst_t *y, const int k,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 64),
                                                sycl::range<3>(1, 1, 64)),
@@ -6551,6 +6553,7 @@ static void dequantize_row_q6_K_cuda(const void *vx, dst_t *y, const int k,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
                                                    sycl::range<3>(1, 1, 64),
                                                sycl::range<3>(1, 1, 64)),
@@ -6633,6 +6636,7 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void *vx, const dfloat *y,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6653,6 +6657,7 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void *vx, const dfloat *y,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6673,6 +6678,7 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void *vx, const dfloat *y,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6693,6 +6699,7 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void *vx, const dfloat *y,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6713,6 +6720,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void *vx, const dfloat *y,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6810,6 +6818,7 @@ static void convert_mul_mat_vec_f16_cuda(const void *vx, const dfloat *y,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6827,17 +6836,13 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void *vx, const void *vy,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->submit([&](sycl::handler &cgh) {
-        sycl::stream stream_ct1(64 * 1024, 80, cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
-                              vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                                 item_ct1, stream_ct1);
-            });
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
+                          vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
 }
 
 static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy,
@@ -6848,17 +6853,13 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->submit([&](sycl::handler &cgh) {
-        sycl::stream stream_ct1(64 * 1024, 80, cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
-                              vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                                 item_ct1, stream_ct1);
-            });
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
+                          vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
 }
 
 static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy,
@@ -6869,17 +6870,13 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->submit([&](sycl::handler &cgh) {
-        sycl::stream stream_ct1(64 * 1024, 80, cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
-                              vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                                 item_ct1, stream_ct1);
-            });
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
+                          vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
 }
 
 static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy,
@@ -6890,17 +6887,13 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->submit([&](sycl::handler &cgh) {
-        sycl::stream stream_ct1(64 * 1024, 80, cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
-                              vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                                 item_ct1, stream_ct1);
-            });
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
+                          vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
 }
 
 static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy,
@@ -6911,17 +6904,13 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->submit([&](sycl::handler &cgh) {
-        sycl::stream stream_ct1(64 * 1024, 80, cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
-                              vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                                 item_ct1, stream_ct1);
-            });
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
+                          vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
 }
 
 static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy,
@@ -6932,17 +6921,13 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->submit([&](sycl::handler &cgh) {
-        sycl::stream stream_ct1(64 * 1024, 80, cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
-                              vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                                 item_ct1, stream_ct1);
-            });
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
+                          vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
 }
 
 static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy,
@@ -6953,17 +6938,13 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->submit([&](sycl::handler &cgh) {
-        sycl::stream stream_ct1(64 * 1024, 80, cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
-                              vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                                 item_ct1, stream_ct1);
-            });
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
+                          vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
 }
 
 static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy,
@@ -6974,17 +6955,13 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->submit([&](sycl::handler &cgh) {
-        sycl::stream stream_ct1(64 * 1024, 80, cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
-                              vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                                 item_ct1, stream_ct1);
-            });
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
+                          vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
 }
 
 static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy,
@@ -6995,17 +6972,13 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->submit([&](sycl::handler &cgh) {
-        sycl::stream stream_ct1(64 * 1024, 80, cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
-                              vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                                 item_ct1, stream_ct1);
-            });
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
+                          vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
 }
 
 static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
@@ -7016,17 +6989,13 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
     const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
     const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->submit([&](sycl::handler &cgh) {
-        sycl::stream stream_ct1(64 * 1024, 80, cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
-                              vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                                 item_ct1, stream_ct1);
-            });
-    });
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
+                          vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
 }
 
 static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
@@ -7036,7 +7005,8 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7068,39 +7038,99 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:29: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:30: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     }
 }
 catch (sycl::exception const &exc) {
@@ -7116,7 +7146,8 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7148,39 +7179,101 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:31: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        //zjy const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(mmq_y /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   mmq_y /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     }
 }
 catch (sycl::exception const &exc) {
@@ -7196,7 +7289,8 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7228,39 +7322,99 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     }
 }
 catch (sycl::exception const &exc) {
@@ -7276,7 +7430,8 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7308,39 +7463,99 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     }
 }
 catch (sycl::exception const &exc) {
@@ -7356,7 +7571,8 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7388,39 +7604,99 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q8_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q8_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     }
 }
 catch (sycl::exception const &exc) {
@@ -7436,7 +7712,8 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7468,39 +7745,99 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q2_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:46: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q2_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     }
 }
 catch (sycl::exception const &exc) {
@@ -7518,7 +7855,8 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
 #if QK_K == 256
 
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7550,39 +7888,99 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q3_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:48: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q3_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     }
 #endif
 }
@@ -7599,7 +7997,8 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7631,39 +8030,99 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:49: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:50: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     }
 }
 catch (sycl::exception const &exc) {
@@ -7679,7 +8138,8 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7711,39 +8171,99 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:51: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:52: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:46: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     }
 }
 catch (sycl::exception const &exc) {
@@ -7759,7 +8279,8 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7791,39 +8312,99 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:53: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q6_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:54: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:48: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::stream stream_ct1(64 * 1024, 80, cgh);
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
 
-            cgh.parallel_for(
-                sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                [=](sycl::nd_item<3> item_ct1) {
-                    mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x,
-                                             ncols_y, nrows_y, nrows_dst,
-                                             stream_ct1);
-                });
-        });
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (2 * WARP_SIZE) +
+                                   dpct_placeholder /*Fix the type mannually*/),
+                    cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / QI6_K) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
+                                       (WARP_SIZE / 8) +
+                                   dpct_placeholder /*Fix the type mannually*/ /
+                                       8),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q6_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
     }
 }
 catch (sycl::exception const &exc) {
@@ -7844,6 +8425,7 @@ static void ggml_mul_mat_p021_f16_f32_cuda(const void *vx, const float *y,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -7863,6 +8445,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -7885,6 +8468,7 @@ static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
                                   sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
@@ -7909,6 +8493,7 @@ static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
                                   sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
@@ -7990,6 +8575,7 @@ static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
                                   sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
@@ -8038,12 +8624,13 @@ static void rope_cuda(const T *x, T *dst, int ncols, int nrows,
     const sycl::range<3> block_nums(1, num_blocks_x, nrows);
     if (pos == nullptr) {
         /*
-        DPCT1049:55: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:49: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) {
@@ -8053,12 +8640,13 @@ static void rope_cuda(const T *x, T *dst, int ncols, int nrows,
             });
     } else {
         /*
-        DPCT1049:56: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:50: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) {
@@ -8085,12 +8673,13 @@ static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows,
 
     if (pos == nullptr) {
         /*
-        DPCT1049:57: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:51: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) {
@@ -8101,12 +8690,13 @@ static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows,
             });
     } else {
         /*
-        DPCT1049:58: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:52: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
             [=](sycl::nd_item<3> item_ct1) {
@@ -8168,7 +8758,7 @@ static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols,
     const sycl::range<3> block_nums(1, nrows, 1);
     if (order == GGML_SORT_ASC) {
         /*
-        DPCT1049:59: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:53: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8179,7 +8769,7 @@ static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols,
             });
     } else if (order == GGML_SORT_DESC) {
         /*
-        DPCT1049:60: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:54: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8217,13 +8807,13 @@ static void soft_max_f32_cuda(const float *x, const float *y, float *dst,
     const sycl::range<3> block_dims(1, 1, nth);
     const sycl::range<3> block_nums(1, 1, nrows_x);
     /*
-    DPCT1049:61: The work-group size passed to the SYCL kernel may exceed the
+    DPCT1049:55: The work-group size passed to the SYCL kernel may exceed the
     limit. To get the device limit, query info::device::max_work_group_size.
     Adjust the work-group size if needed.
     */
     stream->submit([&](sycl::handler &cgh) {
         /*
-        DPCT1101:111: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
+        DPCT1101:105: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
         replaced with a value. Modify the code to use the original expression,
         provided in comments, if it is correct.
         */
@@ -8250,6 +8840,7 @@ static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH,
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
+
         stream->parallel_for(
             sycl::nd_range<3>(block_nums *
                                   sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE),
@@ -8293,7 +8884,8 @@ static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 #ifdef DEBUG_CUDA_MALLOC
     int nnz = 0;
     size_t max_size = 0;
@@ -8354,7 +8946,8 @@ catch (sycl::exception const &exc) {
 static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
         cuda_buffer& b = g_cuda_buffer_pool[id][i];
@@ -8377,10 +8970,10 @@ catch (sycl::exception const &exc) {
 #if !defined(GGML_USE_HIPBLAS)
 // pool with virtual memory
 /*
-DPCT1082:79: Migration of CUmemGenericAllocationHandle type is not supported.
+DPCT1082:73: Migration of CUmemGenericAllocationHandle type is not supported.
 */
-// static std::vector<CUmemGenericAllocationHandle>
-//     g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
+static std::vector<CUmemGenericAllocationHandle>
+    g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
 static dpct::device_ptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
 static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
@@ -8388,7 +8981,8 @@ static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
 static void *ggml_cuda_pool_malloc_vmm(size_t size, size_t *actual_size) try {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 
     // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
     const size_t alignment = 128;
@@ -8406,54 +9000,54 @@ static void *ggml_cuda_pool_malloc_vmm(size_t size, size_t *actual_size) try {
 
         // allocate more physical memory
         /*
-        DPCT1082:80: Migration of CUmemAllocationProp type is not supported.
+        DPCT1082:74: Migration of CUmemAllocationProp type is not supported.
         */
         CUmemAllocationProp prop = {};
         prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
         prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
         prop.location.id = id;
         /*
-        DPCT1082:81: Migration of CUmemGenericAllocationHandle type is not
+        DPCT1082:75: Migration of CUmemGenericAllocationHandle type is not
         supported.
         */
-        // CUmemGenericAllocationHandle handle;
+        CUmemGenericAllocationHandle handle;
         /*
-        DPCT1007:84: Migration of cuMemCreate is not supported.
+        DPCT1007:78: Migration of cuMemCreate is not supported.
         */
-        // CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
+        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
 
         // reserve virtual address space (if not already reserved)
         if (g_cuda_pool_addr[id] == 0) {
             /*
-            DPCT1007:85: Migration of cuMemAddressReserve is not supported.
+            DPCT1007:79: Migration of cuMemAddressReserve is not supported.
             */
-            // CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id],
-            //                              CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id],
+                                         CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
         }
 
         // map at the end of the pool
         /*
-        DPCT1007:86: Migration of cuMemMap is not supported.
+        DPCT1007:80: Migration of cuMemMap is not supported.
         */
-        // CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id],
-        //                   reserve_size, 0, handle, 0));
+        CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id],
+                          reserve_size, 0, handle, 0));
 
         // set access
         /*
-        DPCT1082:87: Migration of CUmemAccessDesc type is not supported.
+        DPCT1082:81: Migration of CUmemAccessDesc type is not supported.
         */
         CUmemAccessDesc access = {};
         access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
         access.location.id = id;
         access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
         /*
-        DPCT1007:88: Migration of cuMemSetAccess is not supported.
+        DPCT1007:82: Migration of cuMemSetAccess is not supported.
         */
         CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id],
                                 reserve_size, &access, 1));
 
         // add to the pool
-        // g_cuda_pool_handles[id].push_back(handle);
+        g_cuda_pool_handles[id].push_back(handle);
         g_cuda_pool_size[id] += reserve_size;
 
         //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
@@ -8482,7 +9076,8 @@ catch (sycl::exception const &exc) {
 static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 
 #ifdef DEBUG_CUDA_MALLOC
     printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
@@ -8501,8 +9096,8 @@ catch (sycl::exception const &exc) {
 
 static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try {
     int id;
-
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     if (g_device_caps[id].vmm) {
         return ggml_cuda_pool_malloc_vmm(size, actual_size);
     } else {
@@ -8517,7 +9112,8 @@ catch (sycl::exception const &exc) {
 
 static void ggml_cuda_pool_free(void *ptr, size_t size) try {
     int id;
-    CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     if (g_device_caps[id].vmm) {
         ggml_cuda_pool_free_vmm(ptr, size);
     } else {
@@ -8573,43 +9169,37 @@ bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }
 
-void print_devices(int device_count){
+void print_devices(){
+    int device_count = dpct::dev_mgr::instance().device_count()
     for (int id = 0; id < device_count; ++id) {
         dpct::device_info prop;
         CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
             prop, dpct::dev_mgr::instance().get_device(id))));
-
         fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id,
                 prop.get_name(), prop.get_major_version(),
                 prop.get_minor_version());
     }
 }
 
-int get_env_value(const char *env_name, int default_val){
-    char * user_device_string = getenv(env_name);
-    int user_device_number = -1;
-
-    unsigned n;
-    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < g_device_count) {
-        user_device_number = (int)n;
-    } else {
-        user_device_number=default_val;
-    }
-}
 void ggml_init_cublas() try {
     static bool initialized = false;
 
     if (!initialized) {
+        print_devices();
 
-#ifdef __HIP_PLATFORM_AMD__
-        // Workaround for a rocBLAS bug when using multiple graphics cards:
-        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
-        rocblas_initialize();
-        CUDA_CHECK(cudaDeviceSynchronize());
-#endif
+        char * user_device_string = getenv("GGML_SYCL_DEVICE");
+        int user_device_number = -1;
 
-        g_device_count = dpct::dev_mgr::instance().device_count();
-        if (DPCT_CHECK_ERROR(g_device_count != 0)) {
+        unsigned n;
+        if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < g_device_count) {
+            user_device_number = (int)n;
+        } else {
+            user_device_number=0;
+        }
+
+        if (DPCT_CHECK_ERROR(g_device_count =
+                                 dpct::dev_mgr::instance().device_count()) !=
+            0) {
             initialized = true;
             g_cublas_loaded = false;
             return;
@@ -8628,55 +9218,19 @@ void ggml_init_cublas() try {
         fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
         fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-        print_devices(g_device_count);
 
-	//zjy hardcode, force set to 1 device
+        //zjy hardcode, force set to 1 device
         g_device_count = 1;
-
         for (int id = 0; id < g_device_count; ++id) {
             int device_vmm = 0;
 
-#if !defined(GGML_USE_HIPBLAS)
-            //int device;
-            //CU_CHECK(DPCT_CHECK_ERROR(device = id));
-            /*
-            DPCT1028:89: The cuDeviceGetAttribute was not migrated because
-            parameter CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED is
-            unsupported.
-            */
-            /*CU_CHECK(cuDeviceGetAttribute(
-                &device_vmm,
-                CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
-                device));
-            */
-            //if (device_vmm) {
-                /*
-                DPCT1082:90: Migration of CUmemAllocationProp type is not
-                supported.
-                */
-                //CUmemAllocationProp alloc_prop = {};
-                //alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-                //alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-                //alloc_prop.location.id = id;
-                /*
-                DPCT1007:91: Migration of cuMemGetAllocationGranularity is not
-                supported.
-                */
-                //CU_CHECK(cuMemGetAllocationGranularity(
-                //    &g_device_caps[id].vmm_granularity, &alloc_prop,
-                //    CU_MEM_ALLOC_GRANULARITY_MINIMUM));
-            //}
-#endif // !defined(GGML_USE_HIPBLAS)
             g_device_caps[id].vmm = !!device_vmm;
 
             dpct::device_info prop;
-            dpct::get_device_info(
-                prop, dpct::dev_mgr::instance().get_device(id))；
-
-            // CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
-            //     prop, dpct::dev_mgr::instance().get_device(id))));
+            CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+                prop, dpct::dev_mgr::instance().get_device(id))));
             /*
-            DPCT1005:92: The SYCL device version is different from CUDA Compute
+            DPCT1005:86: The SYCL device version is different from CUDA Compute
             Compatibility. You may need to rewrite this code.
             */
             fprintf(stderr,
@@ -8690,55 +9244,45 @@ void ggml_init_cublas() try {
             g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
 #else
             /*
-            DPCT1005:93: The SYCL device version is different from CUDA Compute
+            DPCT1005:87: The SYCL device version is different from CUDA Compute
             Compatibility. You may need to rewrite this code.
             */
             g_device_caps[id].cc =
                 100 * prop.get_major_version() + 10 * prop.get_minor_version();
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         }
-
-        int user_device_number = get_env_value("GGML_SYCL_DEVICE", 0);
-
         for (int id = 0; id < g_device_count; ++id) {
             g_tensor_split[id] /= total_vram;
         }
 
         for (int id = 0; id < g_device_count; ++id) {
-            ggml_cuda_set_device(id)；
-            // CUDA_CHECK(ggml_cuda_set_device(id));
+            CUDA_CHECK(ggml_cuda_set_device(user_device_number));
 
             // create cuda streams
             for (int is = 0; is < MAX_STREAMS; ++is) {
                 /*
-                DPCT1025:105: The SYCL queue is created ignoring the flag and
+                DPCT1025:88: The SYCL queue is created ignoring the flag and
                 priority options.
                 */
-                g_cudaStreams[id][is] =
-                        dpct::get_current_device().create_queue()；
-                // CUDA_CHECK(DPCT_CHECK_ERROR(
-                //     g_cudaStreams[id][is] =
-                //         dpct::get_current_device().create_queue()));
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    g_cudaStreams[id][is] =
+                        dpct::get_current_device().create_queue()));
             }
 
             // create cublas handle
-            g_cublas_handles[id] = &dpct::get_in_order_queue();
-            // CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
-            //                                   &dpct::get_in_order_queue()));
+            CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
+                                              &dpct::get_in_order_queue()));
             /*
-            DPCT1027:107: The call to cublasSetMathMode was replaced with 0
-            because this call is redundant in SYCL.
+            DPCT1027:89: The call to cublasSetMathMode was replaced with 0
+            because this functionality is redundant in SYCL.
             */
             CUBLAS_CHECK(0);
         }
 
         // configure logging to stdout
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
-
-
         ggml_cuda_set_device(user_device_number);
         fprintf(stderr, "  set Device %d\n", user_device_number);
-
         initialized = true;
         g_cublas_loaded = true;
     }
@@ -8782,22 +9326,22 @@ void *ggml_cuda_host_malloc(size_t size) try {
     dpct::err0 err = DPCT_CHECK_ERROR(
         ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
     /*
-    DPCT1000:97: Error handling if-stmt was detected but could not be rewritten.
+    DPCT1000:91: Error handling if-stmt was detected but could not be rewritten.
     */
     if (err != 0) {
         // clear the error
         /*
-        DPCT1026:98: The call to cudaGetLastError was removed because this call
-        is redundant in SYCL.
+        DPCT1026:92: The call to cudaGetLastError was removed because this
+        functionality is redundant in SYCL.
         */
         /*
-        DPCT1001:96: The statement could not be removed.
+        DPCT1001:90: The statement could not be removed.
         */
         fprintf(
             stderr,
             "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
             /*
-            DPCT1009:99: SYCL uses exceptions to report errors and does not use
+            DPCT1009:93: SYCL uses exceptions to report errors and does not use
             the error codes. The original code was commented out and a warning
             string was inserted. You need to rewrite this code.
             */
@@ -8839,7 +9383,8 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
         kind = dpct::device_to_device;
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
-        CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            id = dpct::dev_mgr::instance().current_device_id()));
         src_ptr = (char *) extra->data_device[id];
     } else {
         GGML_ASSERT(false);
@@ -8871,10 +9416,10 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
             dpct::err0 r = DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
                 rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream));
             /*
-            DPCT1001:100: The statement could not be removed.
+            DPCT1001:94: The statement could not be removed.
             */
             /*
-            DPCT1000:101: Error handling if-stmt was detected but could not be
+            DPCT1000:95: Error handling if-stmt was detected but could not be
             rewritten.
             */
             if (r != 0) return r;
@@ -9256,8 +9801,8 @@ inline void ggml_cuda_op_mul_mat_q(
     const int64_t row_diff = row_high - row_low;
 
     int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 
     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
@@ -9520,8 +10065,8 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const int64_t row_diff = row_high - row_low;
 
     int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 
     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
@@ -9561,7 +10106,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
         CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
         CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm(
-            g_cublas_handles, oneapi::mkl::transpose::trans,
+            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
             &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
             src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
@@ -9588,8 +10133,10 @@ inline void ggml_cuda_op_mul_mat_cublas(
         CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
         CUBLAS_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(
             *g_cublas_handles[id], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, alpha,
-            src0_ddf_i, ne00, src1_ddf_i, ne10, beta, dst_dd_i, ldc)));
+            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
+            dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00,
+            src1_ddf_i, ne10, dpct::get_value(&beta, *g_cublas_handles[id]),
+            dst_dd_i, ldc)));
     }
 
     (void) dst;
@@ -9850,7 +10397,7 @@ inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     /*
-    DPCT1010:102: SYCL uses exceptions to report errors and does not use the
+    DPCT1010:96: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
     */
     CUDA_CHECK(0);
@@ -9875,7 +10422,7 @@ inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
 
     clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
     /*
-    DPCT1010:103: SYCL uses exceptions to report errors and does not use the
+    DPCT1010:97: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
     */
     CUDA_CHECK(0);
@@ -9940,7 +10487,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
     // do the computation
     op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
     /*
-    DPCT1010:104: SYCL uses exceptions to report errors and does not use the
+    DPCT1010:98: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
     */
     CUDA_CHECK(0);
@@ -10131,7 +10678,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
             if (src1_on_device && src1_is_contiguous) {
                 quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
                 /*
-                DPCT1010:105: SYCL uses exceptions to report errors and does not
+                DPCT1010:99: SYCL uses exceptions to report errors and does not
                 use the error codes. The call was replaced with 0. You need to
                 rewrite this code.
                 */
@@ -10152,7 +10699,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
     if (split && used_devices > 1) {
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         /*
-        DPCT1024:106: The original code returned the error code that was further
+        DPCT1024:100: The original code returned the error code that was further
         consumed by the program logic. This original code was replaced with 0.
         You may need to rewrite the program logic consuming the error code.
         */
@@ -10229,7 +10776,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                 if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
                     quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                     /*
-                    DPCT1010:107: SYCL uses exceptions to report errors and does
+                    DPCT1010:101: SYCL uses exceptions to report errors and does
                     not use the error codes. The call was replaced with 0. You
                     need to rewrite this code.
                     */
@@ -10244,7 +10791,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                 op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
                    row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
                 /*
-                DPCT1010:108: SYCL uses exceptions to report errors and does not
+                DPCT1010:102: SYCL uses exceptions to report errors and does not
                 use the error codes. The call was replaced with 0. You need to
                 rewrite this code.
                 */
@@ -10289,7 +10836,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                 // add event for the main device to wait on until other device is done
                 if (split && (id != g_main_device || is != 0)) {
                     /*
-                    DPCT1024:109: The original code returned the error code that
+                    DPCT1024:103: The original code returned the error code that
                     was further consumed by the program logic. This original
                     code was replaced with 0. You may need to rewrite the
                     program logic consuming the error code.
@@ -10666,7 +11213,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
         CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
-            g_cublas_handles, oneapi::mkl::transpose::trans,
+            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const char *)src0_as_f16, dpct::library_data_t::real_half,
             nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
@@ -10683,13 +11230,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
 
         sycl::range<3> block_dims(1, ne12, ne13);
         /*
-        DPCT1049:62: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:56: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
         {
             dpct::has_capability_or_fail(main_stream->get_device(),
                                          {sycl::aspect::fp16});
+
             main_stream->submit([&](sycl::handler &cgh) {
                 const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get();
                 const void **ptrs_src_get_ct3 = ptrs_src.get();
@@ -10707,14 +11255,14 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
             });
         }
         /*
-        DPCT1010:110: SYCL uses exceptions to report errors and does not use the
+        DPCT1010:104: SYCL uses exceptions to report errors and does not use the
         error codes. The call was replaced with 0. You need to rewrite this
         code.
         */
         CUDA_CHECK(0);
 
         CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
-            g_cublas_handles, oneapi::mkl::transpose::trans,
+            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const void **)(ptrs_src.get() + 0 * ne23),
             dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),

From 0c00b4f654268491f82221bc7beb46990f44cd67 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Fri, 29 Dec 2023 14:58:07 +0800
Subject: [PATCH 03/90] add debug functio, commit all help code

---
 README_sycl.md                     |   163 +
 dpcpp_out2/MainSourceFiles.yaml    | 18145 +++++++++++++++++++++++++++
 dpcpp_out2/ggml-alloc.h            |    92 +
 dpcpp_out2/ggml-backend-impl.h     |   116 +
 dpcpp_out2/ggml-backend.h          |   188 +
 dpcpp_out2/ggml-cuda.dp.cpp        | 12724 +++++++++++++++++++
 dpcpp_out2/ggml-cuda.h             |    64 +
 dpcpp_out2/ggml.h                  |  2253 ++++
 dpcpp_out2/ggml.h.yaml             |   100 +
 dpct/atomic.hpp                    |   842 ++
 dpct/blas_utils.hpp                |  1792 +++
 dpct/ccl_utils.hpp                 |   286 +
 dpct/device.hpp                    |   781 ++
 dpct/dnnl_utils.hpp                |  4921 ++++++++
 dpct/dpct.hpp                      |    62 +
 dpct/dpl_extras/algorithm.h        |  2419 ++++
 dpct/dpl_extras/dpcpp_extensions.h |   747 ++
 dpct/dpl_extras/functional.h       |   453 +
 dpct/dpl_extras/iterators.h        |   347 +
 dpct/dpl_extras/memory.h           |  1024 ++
 dpct/dpl_extras/numeric.h          |    32 +
 dpct/dpl_extras/vector.h           |   752 ++
 dpct/dpl_utils.hpp                 |    26 +
 dpct/fft_utils.hpp                 |  1376 ++
 dpct/image.hpp                     |   901 ++
 dpct/kernel.hpp                    |   459 +
 dpct/lapack_utils.hpp              |  1953 +++
 dpct/lib_common_utils.hpp          |   174 +
 dpct/math.hpp                      |  1814 +++
 dpct/memory.hpp                    |  1497 +++
 dpct/rng_utils.hpp                 |   535 +
 dpct/sparse_utils.hpp              |  1385 ++
 dpct/util.hpp                      |  1070 ++
 ggml-sycl.cpp                      |  1240 +-
 ggml-sycl.cpp.base                 | 11951 ++++++++++++++++++
 migrate.sh                         |    18 +
 run.sh                             |    12 +
 setup.sh                           |     7 +
 38 files changed, 71956 insertions(+), 765 deletions(-)
 create mode 100644 README_sycl.md
 create mode 100644 dpcpp_out2/MainSourceFiles.yaml
 create mode 100644 dpcpp_out2/ggml-alloc.h
 create mode 100644 dpcpp_out2/ggml-backend-impl.h
 create mode 100644 dpcpp_out2/ggml-backend.h
 create mode 100644 dpcpp_out2/ggml-cuda.dp.cpp
 create mode 100644 dpcpp_out2/ggml-cuda.h
 create mode 100644 dpcpp_out2/ggml.h
 create mode 100644 dpcpp_out2/ggml.h.yaml
 create mode 100644 dpct/atomic.hpp
 create mode 100644 dpct/blas_utils.hpp
 create mode 100644 dpct/ccl_utils.hpp
 create mode 100644 dpct/device.hpp
 create mode 100644 dpct/dnnl_utils.hpp
 create mode 100644 dpct/dpct.hpp
 create mode 100644 dpct/dpl_extras/algorithm.h
 create mode 100644 dpct/dpl_extras/dpcpp_extensions.h
 create mode 100644 dpct/dpl_extras/functional.h
 create mode 100644 dpct/dpl_extras/iterators.h
 create mode 100644 dpct/dpl_extras/memory.h
 create mode 100644 dpct/dpl_extras/numeric.h
 create mode 100644 dpct/dpl_extras/vector.h
 create mode 100644 dpct/dpl_utils.hpp
 create mode 100644 dpct/fft_utils.hpp
 create mode 100644 dpct/image.hpp
 create mode 100644 dpct/kernel.hpp
 create mode 100644 dpct/lapack_utils.hpp
 create mode 100644 dpct/lib_common_utils.hpp
 create mode 100644 dpct/math.hpp
 create mode 100644 dpct/memory.hpp
 create mode 100644 dpct/rng_utils.hpp
 create mode 100644 dpct/sparse_utils.hpp
 create mode 100644 dpct/util.hpp
 create mode 100644 ggml-sycl.cpp.base
 create mode 100755 migrate.sh
 create mode 100755 run.sh
 create mode 100755 setup.sh

diff --git a/README_sycl.md b/README_sycl.md
new file mode 100644
index 0000000000000..e76b9bbb4b85b
--- /dev/null
+++ b/README_sycl.md
@@ -0,0 +1,163 @@
+# llama.cpp for SYCL
+
+## Background
+
+SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
+
+oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
+
+Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
+
+This project is migrated the CUDA code to SYCL to support Intel CPU, GPU and FPGA.
+
+But we focus on GPU performance tuning. If you want to run llama.cpp on Intel CPU, please use llama.cpp CPU release.
+
+## llama.cpp for SYCL
+
+We migrate the CUDA code SYCL. So the SYCL code replace the CUDA funcitions in llama.cpp, without function name change.
+
+That's why the code macro and log incudes CUBLAS flags.
+
+## OS
+
+### Linux
+
+In Linux, we reuse the CMAKE system of base. It's same as base llama.cpp.
+
+Except branch "windows", other branches are for Linux.
+
+### Windows
+
+In Windows, we change the C source files to meet the requirement of C++ compilers.
+
+So the code is saved in branch **windows** only.
+
+It will output 1 execute file: **llamap.cpp.sycl.exe**.
+
+If you want to get more binary files, please change the build prject.
+
+
+## Linux
+
+### Setup Environment
+
+1. Install Intel oneAPI Base toolkit.
+
+2. Setup Local
+
+```
+./setup.sh
+```
+
+### Run
+
+#### Check device id
+
+Run without parameter:
+
+```
+./build/bin/main
+```
+
+Check the id in startup log, like:
+ggml_init_cublas: found 6 CUDA devices:
+  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3
+  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2
+  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0
+  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0
+  Device 4: Intel(R) UHD Graphics 770, compute capability 3.0
+  Device 5: Intel(R) UHD Graphics 770, compute capability 1.3
+
+#### Put model file to folder **models**
+
+#### Modify run.sh
+
+Up run.sh as above info:
+```
+...
+GGML_SYCL_DEVICE=0
+./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
+```
+
+#### Run
+```
+./run.sh
+```
+
+
+## Windows
+
+### Setup Environment
+
+1. Install MS Visual Studio 2022.
+
+2. Install Intel oneAPI Base toolkit.
+
+a. Recommend to install all components and with **default path**.
+
+b. During installation, please choose option to enable compiler in MS Visual Studio.
+
+3. Code
+
+Swith to branch **windows**.
+
+Open **llama.cpp.sycl.sln** by Visual Studio 2022.
+
+4. Set oneAPI Path (optional)
+
+If you chagne the oneAPI installation target path, please modify the oneAPI path in the Visual Studio.
+
+Else, skip this step.
+
+### Build
+
+Build by visual Studio 2022 with x64 & Release.
+
+There will be execute file: **llama.cpp.sycl.exe**.
+
+It will take long time to build due to enable AOT on all hardware flatforms (CPU, GPU, FPGA) as default.
+
+To short it, change AOT target flatforms to one in Visual Studio 2022: **Specify SYCL offloading targets for AOT compilition**.
+
+#### Run
+
+#### Enable oneAPI Environment
+
+Run the command in command line or powershell.
+
+'C:\Program Files (x86)\Intel\oneAPI\setvars.bat'
+
+##### Check device id
+
+
+Run without parameter:
+
+```
+.\x64\Release\llama.cpp.sycl.exe
+```
+
+Check the id in startup log, like:
+ggml_init_cublas: found 6 CUDA devices:
+  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3
+  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2
+  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0
+  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0
+  Device 4: Intel(R) UHD Graphics 770, compute capability 3.0
+  Device 5: Intel(R) UHD Graphics 770, compute capability 1.3
+
+#### Put model file to folder **models**
+
+#### Modify run.sh
+
+Up run.sh as above info:
+```
+...
+set GGML_SYCL_DEVICE=0
+
+.\x64\Release\llama.cpp.sycl.exe -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
+```
+
+#### Run
+```
+.\run.bat
+```
\ No newline at end of file
diff --git a/dpcpp_out2/MainSourceFiles.yaml b/dpcpp_out2/MainSourceFiles.yaml
new file mode 100644
index 0000000000000..472f76ce1182e
--- /dev/null
+++ b/dpcpp_out2/MainSourceFiles.yaml
@@ -0,0 +1,18145 @@
+---
+MainSourceFile:  '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/MainSrcFiles_placehold'
+Replacements:
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          0
+    Length:          0
+    ReplacementText: "#define DPCT_PROFILING_ENABLED\n#define DPCT_COMPAT_RT_VERSION 12010\n#include <sycl/sycl.hpp>\n#include <dpct/dpct.hpp>\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211
+    Length:          0
+    ReplacementText: "\n#include <dpct/blas_utils.hpp>\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          4081
+    Length:          26
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          4107
+    Length:          18
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          4125
+    Length:          23
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          4148
+    Length:          23
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          4176
+    Length:          14
+    ReplacementText: DPCT_COMPAT_RT_VERSION
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          4517
+    Length:          0
+    ReplacementText: "\n#include <cmath>\n\n#include <dpct/lib_common_utils.hpp>\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          8376
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          8431
+    Length:          14
+    ReplacementText: DPCT_COMPAT_RT_VERSION
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          8506
+    Length:          14
+    ReplacementText: int
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          8528
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1009:48: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          8543
+    Length:          26
+    ReplacementText: '"cublasGetStatusString is not supported"/*cublasGetStatusString(err)*/'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          9824
+    Length:          0
+    ReplacementText: "/*\nDPCT1001:50: The statement could not be removed.\n*/\n/*\nDPCT1000:51: Error handling if-stmt was detected but could not be rewritten.\n*/\n/*\nDPCT1009:52: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code.\n*/\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          9886
+    Length:          11
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          9961
+    Length:          24
+    ReplacementText: '"cudaGetErrorString is not supported"/*cudaGetErrorString(err_)*/'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          10064
+    Length:          21
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          10249
+    Length:          8
+    ReplacementText: int
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          10291
+    Length:          0
+    ReplacementText: "    /*\n    DPCT1007:49: Migration of cuGetErrorString is not supported.\n    */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          10350
+    Length:          0
+    ReplacementText: "/*\nDPCT1001:67: The statement could not be removed.\n*/\n/*\nDPCT1000:68: Error handling if-stmt was detected but could not be rewritten.\n*/\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          10412
+    Length:          12
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          10540
+    Length:          14
+    ReplacementText: DPCT_COMPAT_RT_VERSION
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          10822
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          10869
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          10880
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          11159
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          11170
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          11451
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          11462
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          11646
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          11657
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          11942
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          12027
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          12774
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          12985
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          13253
+    Length:          0
+    ReplacementText: ' dpct_type_471834'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          13260
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          13542
+    Length:          0
+    ReplacementText: ' dpct_type_143705'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          13549
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          13854
+    Length:          0
+    ReplacementText: ' dpct_type_673649'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          13861
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          14212
+    Length:          0
+    ReplacementText: ' dpct_type_135589'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          14219
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          14593
+    Length:          0
+    ReplacementText: ' dpct_type_122878'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          14600
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          14868
+    Length:          0
+    ReplacementText: ' dpct_type_143721'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          14875
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          15254
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          15390
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          15640
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          15770
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          16066
+    Length:          0
+    ReplacementText: ' dpct_type_619598'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          16183
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          16445
+    Length:          0
+    ReplacementText: ' dpct_type_138576'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          16713
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          17280
+    Length:          0
+    ReplacementText: ' dpct_type_154943'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          17287
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          18042
+    Length:          0
+    ReplacementText: ' dpct_type_866817'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          18049
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          18523
+    Length:          0
+    ReplacementText: ' dpct_type_107281'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          18669
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          20185
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          20254
+    Length:          7
+    ReplacementText: '&dpct::get_in_order_queue()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          20397
+    Length:          11
+    ReplacementText: 'dpct::event_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          20636
+    Length:          11
+    ReplacementText: 'dpct::err0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          20687
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          20728
+    Length:          30
+    ReplacementText: 'DPCT_CHECK_ERROR(current_device = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          20813
+    Length:          11
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          20833
+    Length:          0
+    ReplacementText: "    /*\n    DPCT1093:53: The \"device\" device may be not the one intended for use. Adjust the selected device if needed.\n    */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          20844
+    Length:          13
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::select_device'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          20865
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          20868
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21441
+    Length:          14
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21530
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21555
+    Length:          0
+    ReplacementText: 'const sycl::stream &stream_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21563
+    Length:          91
+    ReplacementText: 'stream_ct1 << "ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n"'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21738
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21749
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21794
+    Length:          0
+    ReplacementText: ",\n                                             const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21861
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1096:98: The right-most dimension of the work-group used in the SYCL kernel that calls this function may be less than \"32\". The function \"dpct::permute_sub_group_by_xor\" may return an unexpected result on the CPU device. Modify the size of the work-group to ensure that the value of the right-most dimension is a multiple of \"32\".\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21874
+    Length:          40
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21946
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21957
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21973
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          21996
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22004
+    Length:          0
+    ReplacementText: ",\n                                              const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22079
+    Length:          3
+    ReplacementText: 'a.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22086
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(), mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22138
+    Length:          3
+    ReplacementText: 'a.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22145
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(), mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22219
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22230
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22275
+    Length:          0
+    ReplacementText: ",\n                                             const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22342
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1096:97: The right-most dimension of the work-group used in the SYCL kernel that calls this function may be less than \"32\". The function \"dpct::permute_sub_group_by_xor\" may return an unexpected result on the CPU device. Modify the size of the work-group to ensure that the value of the right-most dimension is a multiple of \"32\".\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22354
+    Length:          50
+    ReplacementText: 'sycl::fmax(x, dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22436
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22447
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22535
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22546
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22635
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22646
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22735
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22746
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          22937
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23205
+    Length:          0
+    ReplacementText: ",\n        const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23229
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23240
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23253
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23286
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23297
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23310
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23344
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23355
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23368
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23408
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23419
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23432
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23935
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          23946
+    Length:          9
+    ReplacementText: 'item_ct1.get_group_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          24206
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          24482
+    Length:          0
+    ReplacementText: ",\n        const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          24505
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          24516
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          24529
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          25255
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          25436
+    Length:          0
+    ReplacementText: ', const sycl::nd_item<3> &item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          25458
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          25471
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          25484
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          25849
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          25915
+    Length:          0
+    ReplacementText: ",\n                     const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26051
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26062
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26075
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26180
+    Length:          51
+    ReplacementText: 'sycl::tanh(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26244
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26310
+    Length:          0
+    ReplacementText: ",\n                     const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26332
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26343
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26356
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26438
+    Length:          11
+    ReplacementText: 'sycl::native::exp(-x[i])'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26462
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26526
+    Length:          0
+    ReplacementText: ",\n                           const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26592
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26603
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26616
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26705
+    Length:          28
+    ReplacementText: 'sycl::native::exp(GELU_QUICK_COEF * x[i])'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26747
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26805
+    Length:          0
+    ReplacementText: ",\n                     const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26828
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26839
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26852
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26918
+    Length:          11
+    ReplacementText: 'sycl::tanh((float)(x[i]))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          26941
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27007
+    Length:          0
+    ReplacementText: ",\n                     const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27029
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27040
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27053
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27120
+    Length:          14
+    ReplacementText: 'sycl::fmax((float)(x[i]), (float)0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27146
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27244
+    Length:          0
+    ReplacementText: ",\n                           const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27267
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27278
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27291
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27357
+    Length:          14
+    ReplacementText: 'sycl::fmax((float)(x[i]), (float)0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27374
+    Length:          17
+    ReplacementText: 'sycl::fmin((float)(x[i]), 0.0f)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27420
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27485
+    Length:          0
+    ReplacementText: ",\n                    const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27507
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27518
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27531
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27647
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27734
+    Length:          0
+    ReplacementText: ",\n                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27758
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27769
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27782
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27815
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27833
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27851
+    Length:          21
+    ReplacementText: 'sycl::float2(0.f, 0.f)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          27986
+    Length:          10
+    ReplacementText: 'mean_var.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28012
+    Length:          10
+    ReplacementText: 'mean_var.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28108
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28153
+    Length:          28
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28204
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28251
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28353
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1118:0: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28361
+    Length:          15
+    ReplacementText: 'item_ct1.barrier(sycl::access::fence_space::local_space)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28456
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28489
+    Length:          10
+    ReplacementText: 'mean_var.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28531
+    Length:          10
+    ReplacementText: 'mean_var.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28591
+    Length:          17
+    ReplacementText: 'sycl::rsqrt(var + eps)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28755
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28856
+    Length:          0
+    ReplacementText: ",\n                       const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28875
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28889
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          28902
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29020
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29047
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29066
+    Length:          9
+    ReplacementText: 'item_ct1.get_group_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29085
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29170
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29201
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29220
+    Length:          9
+    ReplacementText: 'item_ct1.get_group_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29345
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29377
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29405
+    Length:          9
+    ReplacementText: 'item_ct1.get_group_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29477
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29588
+    Length:          0
+    ReplacementText: ",\n                        const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29642
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29656
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29669
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29792
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29883
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29946
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29973
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          29992
+    Length:          9
+    ReplacementText: 'item_ct1.get_group_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30050
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30164
+    Length:          0
+    ReplacementText: ",\n                    const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30183
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30197
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30210
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30329
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30356
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30375
+    Length:          9
+    ReplacementText: 'item_ct1.get_group_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30409
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30430
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30507
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30539
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30697
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30818
+    Length:          0
+    ReplacementText: ",\n                           const sycl::nd_item<3> &item_ct1, float *s_sum"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30838
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          30911
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31155
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31200
+    Length:          27
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31250
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31297
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31394
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1118:1: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31402
+    Length:          15
+    ReplacementText: "/*\n        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31482
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31709
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31754
+    Length:          27
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31804
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31851
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31948
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1118:2: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          31956
+    Length:          15
+    ReplacementText: "/*\n        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32036
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32103
+    Length:          22
+    ReplacementText: 'sycl::rsqrt(variance + eps)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32246
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32337
+    Length:          0
+    ReplacementText: ",\n                         const sycl::nd_item<3> &item_ct1, float *s_sum"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32361
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32372
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32385
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32418
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32679
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32724
+    Length:          27
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32774
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32821
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32918
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1118:3: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          32926
+    Length:          15
+    ReplacementText: 'item_ct1.barrier(sycl::access::fence_space::local_space)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33006
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33076
+    Length:          18
+    ReplacementText: 'sycl::rsqrt(mean + eps)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33230
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33241
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33462
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33483
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33593
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33600
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33621
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33628
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33679
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33690
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33861
+    Length:          20
+    ReplacementText: 'x[ib].dm[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33904
+    Length:          21
+    ReplacementText: 'x[ib].dm[1]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33968
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          33989
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34093
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34100
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34118
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34125
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34173
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34184
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34537
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34579
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34713
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34720
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34742
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34749
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34801
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34812
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          34983
+    Length:          20
+    ReplacementText: 'x[ib].dm[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35026
+    Length:          21
+    ReplacementText: 'x[ib].dm[1]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35222
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35264
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35390
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35397
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35415
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35422
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35470
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35481
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35666
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35695
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35780
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35794
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          35910
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          36001
+    Length:          0
+    ReplacementText: ",\n                                  const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          36026
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          36111
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          36329
+    Length:          19
+    ReplacementText: 'x[i].dm[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          36367
+    Length:          20
+    ReplacementText: 'x[i].dm[1]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          37268
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          37359
+    Length:          0
+    ReplacementText: ",\n                                  const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          37382
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          37481
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          37577
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          39257
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          39566
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          39657
+    Length:          0
+    ReplacementText: ",\n                                  const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          39732
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          39806
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          39991
+    Length:          19
+    ReplacementText: 'x[i].dm[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          40035
+    Length:          20
+    ReplacementText: 'x[i].dm[1]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          40869
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          40960
+    Length:          0
+    ReplacementText: ",\n                                  const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          41035
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          41159
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          41386
+    Length:          19
+    ReplacementText: 'x[i].dm[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          41430
+    Length:          20
+    ReplacementText: 'x[i].dm[1]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          42569
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          42660
+    Length:          0
+    ReplacementText: ",\n                                  const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          42735
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          42859
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          44069
+    Length:          0
+    ReplacementText: "/*\nDPCT1110:4: The total declared local variable size in device function dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          44076
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          44233
+    Length:          0
+    ReplacementText: ",\n                                        const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          44360
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          44371
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          44384
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          44669
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          44746
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          45550
+    Length:          19
+    ReplacementText: 'x[i].dm[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          45598
+    Length:          20
+    ReplacementText: 'x[i].dm[1]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          47936
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          47995
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          48048
+    Length:          0
+    ReplacementText: "/*\nDPCT1110:5: The total declared local variable size in device function dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          48055
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          48212
+    Length:          0
+    ReplacementText: ",\n                                        const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          48237
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          48248
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          48261
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          48620
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          48697
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          52351
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          52410
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          52463
+    Length:          0
+    ReplacementText: "/*\nDPCT1110:6: The total declared local variable size in device function dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          52470
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          52627
+    Length:          0
+    ReplacementText: ",\n                                        const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          52652
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          52663
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          52676
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          53015
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          53092
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          54147
+    Length:          19
+    ReplacementText: 'x[i].dm[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          54195
+    Length:          20
+    ReplacementText: 'x[i].dm[1]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          54792
+    Length:          6
+    ReplacementText: 'sycl::float4'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          54899
+    Length:          3
+    ReplacementText: 's.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          54923
+    Length:          3
+    ReplacementText: 's.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          54963
+    Length:          3
+    ReplacementText: 's.z()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          54987
+    Length:          3
+    ReplacementText: 's.w()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          55137
+    Length:          3
+    ReplacementText: 's.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          55151
+    Length:          3
+    ReplacementText: 's.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          55176
+    Length:          3
+    ReplacementText: 's.z()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          55190
+    Length:          3
+    ReplacementText: 's.w()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          57141
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          57245
+    Length:          0
+    ReplacementText: "/*\nDPCT1110:7: The total declared local variable size in device function dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          57252
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          57398
+    Length:          0
+    ReplacementText: ",\n                                        const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          57423
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          57786
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          57832
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          58662
+    Length:          19
+    ReplacementText: 'x[i].dm[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          58710
+    Length:          20
+    ReplacementText: 'x[i].dm[1]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          59016
+    Length:          6
+    ReplacementText: 'sycl::float4'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          59509
+    Length:          5
+    ReplacementText: 'sum.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          59666
+    Length:          5
+    ReplacementText: 'sum.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          59823
+    Length:          5
+    ReplacementText: 'sum.z()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          59980
+    Length:          5
+    ReplacementText: 'sum.w()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          60317
+    Length:          5
+    ReplacementText: 'sum.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          60333
+    Length:          5
+    ReplacementText: 'sum.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          60349
+    Length:          5
+    ReplacementText: 'sum.z()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          60365
+    Length:          5
+    ReplacementText: 'sum.w()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          61535
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          61594
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          61654
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          61811
+    Length:          0
+    ReplacementText: ",\n                                        const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          61938
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          61949
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          61962
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          62194
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          62271
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          65877
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          65988
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66086
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66104
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66181
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66208
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66241
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66436
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66463
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66496
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66613
+    Length:          0
+    ReplacementText: ",\n                          const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66636
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66647
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66660
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66743
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66754
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          66767
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          67043
+    Length:          9
+    ReplacementText: 'sycl::fabs((float)xi)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          67153
+    Length:          56
+    ReplacementText: 'sycl::fmax(amax, dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), amax, mask))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          67226
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          67349
+    Length:          14
+    ReplacementText: 'sycl::round(xi / d)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          67453
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          67460
+    Length:          10
+    ReplacementText: 'y[ib].ds.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          67498
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          67505
+    Length:          10
+    ReplacementText: 'y[ib].ds.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          67614
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68030
+    Length:          0
+    ReplacementText: ",\n            const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68072
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68083
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68096
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68132
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68143
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68156
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68190
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68201
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68214
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68254
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68265
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68278
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68848
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68890
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          68947
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69371
+    Length:          0
+    ReplacementText: ",\n            const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69412
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69423
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69436
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69469
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69480
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69493
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69527
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69538
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69551
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69591
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69602
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          69615
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          70018
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          70116
+    Length:          0
+    ReplacementText: ",\n                             const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          70138
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          70149
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          70164
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          70509
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          70545
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          70789
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          70800
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          70904
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          70924
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          71239
+    Length:          27
+    ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          71283
+    Length:          27
+    ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          71329
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          71343
+    Length:          19
+    ReplacementText: 'ds8.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          71454
+    Length:          6
+    ReplacementText: 'ds8f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          71479
+    Length:          6
+    ReplacementText: 'ds8f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          71637
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          71648
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          71734
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          71753
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          71773
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72088
+    Length:          27
+    ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72132
+    Length:          27
+    ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72323
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72337
+    Length:          19
+    ReplacementText: 'dm4.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72368
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72382
+    Length:          19
+    ReplacementText: 'ds8.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72426
+    Length:          6
+    ReplacementText: 'dm4f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72435
+    Length:          6
+    ReplacementText: 'ds8f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72466
+    Length:          6
+    ReplacementText: 'dm4f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72475
+    Length:          6
+    ReplacementText: 'ds8f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72814
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72825
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72946
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          72966
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          73452
+    Length:          27
+    ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          73861
+    Length:          27
+    ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          73947
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          73961
+    Length:          19
+    ReplacementText: 'ds8.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          74073
+    Length:          6
+    ReplacementText: 'ds8f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          74099
+    Length:          6
+    ReplacementText: 'ds8f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          74257
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          74268
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          74371
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          74390
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          74410
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          74896
+    Length:          27
+    ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          75305
+    Length:          27
+    ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          75536
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          75550
+    Length:          19
+    ReplacementText: 'dm5.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          75581
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          75595
+    Length:          19
+    ReplacementText: 'ds8.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          75639
+    Length:          6
+    ReplacementText: 'dm5f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          75648
+    Length:          6
+    ReplacementText: 'ds8f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          75679
+    Length:          6
+    ReplacementText: 'dm5f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          75688
+    Length:          6
+    ReplacementText: 'ds8f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          76007
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          76018
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          76145
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          76359
+    Length:          24
+    ReplacementText: 'dpct::dp4a(v[i], u[i], sumi)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          76511
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          76522
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          76608
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          76627
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          76647
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          76861
+    Length:          24
+    ReplacementText: 'dpct::dp4a(v[i], u[i], sumi)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77049
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77063
+    Length:          19
+    ReplacementText: 'dm8.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77094
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77108
+    Length:          19
+    ReplacementText: 'ds8.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77152
+    Length:          6
+    ReplacementText: 'dm8f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77161
+    Length:          6
+    ReplacementText: 'ds8f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77192
+    Length:          6
+    ReplacementText: 'dm8f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77201
+    Length:          6
+    ReplacementText: 'ds8f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77525
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77536
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77681
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          77732
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          78032
+    Length:          19
+    ReplacementText: 'dpct::dp4a(vi, u[i], 0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          78213
+    Length:          18
+    ReplacementText: 'dpct::dp4a(m, u[i], 0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          78305
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          78319
+    Length:          19
+    ReplacementText: 'dm2.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          78352
+    Length:          6
+    ReplacementText: 'dm2f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          78368
+    Length:          6
+    ReplacementText: 'dm2f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          78479
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          78490
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          78647
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          78685
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          79137
+    Length:          29
+    ReplacementText: 'dpct::dp4a(v[i], u[i], sumi_d_sc)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          79212
+    Length:          26
+    ReplacementText: 'dpct::dp4a(m, u[i], sumi_m)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          79348
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          79362
+    Length:          19
+    ReplacementText: 'dm2.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          79401
+    Length:          6
+    ReplacementText: 'dm2f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          79417
+    Length:          6
+    ReplacementText: 'dm2f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          79588
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          79599
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          79837
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          80584
+    Length:          19
+    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          80631
+    Length:          19
+    ReplacementText: 'dpct::dp4a(vi, u[i], 0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          80803
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          80814
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          81007
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          81288
+    Length:          27
+    ReplacementText: 'dpct::dp4a(v[i], u[i], sumi_sc)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          81585
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          81596
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          81782
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          81833
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          82152
+    Length:          47
+    ReplacementText: 'dpct::dp4a(v1i, u[2*i+1], dpct::dp4a(v0i, u[2*i+0], 0))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          82246
+    Length:          61
+    ReplacementText: 'dpct::dp4a(0x01010101, u[2*i+1], dpct::dp4a(0x01010101, u[2*i+0], 0))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          82481
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          82495
+    Length:          19
+    ReplacementText: 'dm4.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          82528
+    Length:          6
+    ReplacementText: 'dm4f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          82544
+    Length:          6
+    ReplacementText: 'dm4f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          82656
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          82667
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          82852
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          82871
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          82904
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83216
+    Length:          60
+    ReplacementText: 'dpct::dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83323
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83337
+    Length:          22
+    ReplacementText: 'ds8[i].convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83380
+    Length:          6
+    ReplacementText: 'ds8f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83425
+    Length:          6
+    ReplacementText: 'ds8f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83495
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83509
+    Length:          19
+    ReplacementText: 'dm4.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83542
+    Length:          6
+    ReplacementText: 'dm4f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83558
+    Length:          6
+    ReplacementText: 'dm4f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83729
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83740
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          83956
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          84007
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          84524
+    Length:          47
+    ReplacementText: 'dpct::dp4a(v0i, u[2*i+0], dpct::dp4a(v1i, u[2*i+1], 0))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          84618
+    Length:          61
+    ReplacementText: 'dpct::dp4a(0x01010101, u[2*i+0], dpct::dp4a(0x01010101, u[2*i+1], 0))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          84795
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          84809
+    Length:          19
+    ReplacementText: 'dm5.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          84842
+    Length:          6
+    ReplacementText: 'dm5f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          84858
+    Length:          6
+    ReplacementText: 'dm5f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          84970
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          84981
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85166
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85185
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85218
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85530
+    Length:          46
+    ReplacementText: 'dpct::dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85623
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85637
+    Length:          22
+    ReplacementText: 'ds8[i].convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85680
+    Length:          6
+    ReplacementText: 'ds8f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85725
+    Length:          6
+    ReplacementText: 'ds8f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85795
+    Length:          6
+    ReplacementText: 'sycl::float2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85809
+    Length:          19
+    ReplacementText: 'dm4.convert<float, sycl::rounding_mode::automatic>()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85842
+    Length:          6
+    ReplacementText: 'dm4f.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          85858
+    Length:          6
+    ReplacementText: 'dm4f.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          86029
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          86040
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          86250
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          86581
+    Length:          34
+    ReplacementText: 'dpct::vectorized_binary<sycl::char4>((vil | vih), 0x20202020, dpct::sub_sat())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          86668
+    Length:          19
+    ReplacementText: 'dpct::dp4a(vi, u[i], 0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          86837
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          86848
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87050
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87236
+    Length:          4
+    ReplacementText: 'sycl::int2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87362
+    Length:          8
+    ReplacementText: 'sumi_d.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87373
+    Length:          36
+    ReplacementText: 'dpct::dp4a(v[2*i+0], u[2*i+0], sumi_d.x())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87443
+    Length:          8
+    ReplacementText: 'sumi_d.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87454
+    Length:          36
+    ReplacementText: 'dpct::dp4a(v[2*i+1], u[2*i+1], sumi_d.x())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87525
+    Length:          8
+    ReplacementText: 'sumi_d.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87536
+    Length:          36
+    ReplacementText: 'dpct::dp4a(v[2*i+4], u[2*i+4], sumi_d.y())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87606
+    Length:          8
+    ReplacementText: 'sumi_d.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87617
+    Length:          36
+    ReplacementText: 'dpct::dp4a(v[2*i+5], u[2*i+5], sumi_d.y())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87727
+    Length:          8
+    ReplacementText: 'sumi_d.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87749
+    Length:          8
+    ReplacementText: 'sumi_d.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87863
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          87874
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          88517
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          88528
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          88582
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          88621
+    Length:          0
+    ReplacementText: ",\n                                                                     int *tile_x_qs_q4_0,\n                                                                     float *tile_x_d_q4_0"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          88658
+    Length:          66
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          88729
+    Length:          72
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          88844
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          88928
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          88939
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          89036
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          89674
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          90202
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          90375
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          90386
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          90476
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          90606
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          91301
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          91312
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          91963
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          91974
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          92028
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          92067
+    Length:          0
+    ReplacementText: ",\n                                                                     int *tile_x_qs_q4_1,\n                                                                     sycl::half2 *tile_x_dm_q4_1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          92104
+    Length:          67
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          92176
+    Length:          73
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          92367
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          92378
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          92475
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          93077
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          93545
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          93718
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          93729
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          93819
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          93949
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          94595
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          94606
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          95360
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          95371
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          95425
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          95464
+    Length:          0
+    ReplacementText: ",\n                                                                     int *tile_x_ql_q5_0,\n                                                                     float *tile_x_d_q5_0"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          95501
+    Length:          66
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          95572
+    Length:          72
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          95687
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          95771
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          95782
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          95879
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          96481
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          96997
+    Length:          26
+    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(qs0, 0x10101010, dpct::sub_sat())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          97388
+    Length:          26
+    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(qs1, 0x10101010, dpct::sub_sat())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          97814
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          97987
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          97998
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          98088
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          98218
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          98999
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          99010
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          99779
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          99790
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          99844
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          99883
+    Length:          0
+    ReplacementText: ",\n                                                                     int *tile_x_ql_q5_1,\n                                                                     sycl::half2 *tile_x_dm_q5_1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          99920
+    Length:          67
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          99992
+    Length:          73
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          100183
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          100194
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          100291
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          100892
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          102055
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          102228
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          102239
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          102329
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          102459
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          103145
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          103156
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          103671
+    Length:          21
+    ReplacementText: 'bq8_1->ds[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          103726
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          103737
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          103791
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          103830
+    Length:          0
+    ReplacementText: ",\n                                                                     int *tile_x_qs_q8_0,\n                                                                     float *tile_x_d_q8_0"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          103867
+    Length:          66
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          103938
+    Length:          72
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          104053
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          104137
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          104148
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          104245
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          104883
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          105342
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          105515
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          105526
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          105616
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          105746
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          106175
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          106186
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          106823
+    Length:          36
+    ReplacementText: 'bq8_1[bq8_offset + i].ds[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          106968
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          106979
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          107033
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          107072
+    Length:          0
+    ReplacementText: ",\n                                                                     int *tile_x_ql_q2_K,\n                                                                     sycl::half2 *tile_x_dm_q2_K,\n                                                                     int *tile_x_sc_q2_K"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          107097
+    Length:          67
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          107169
+    Length:          73
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          107247
+    Length:          69
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          107462
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          107473
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          107570
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          108160
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          108638
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          108968
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          109220
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          109231
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          109321
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          109451
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          110293
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          110304
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111071
+    Length:          36
+    ReplacementText: 'bq8_1[bq8_offset + i].ds[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111234
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111245
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111299
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111338
+    Length:          0
+    ReplacementText: ",\n                                                                     int *tile_x_ql_q3_K,\n                                                                     sycl::half2 *tile_x_dm_q3_K,\n                                                                     int *tile_x_qh_q3_K,\n                                                                     int *tile_x_sc_q3_K"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111347
+    Length:          67
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111419
+    Length:          73
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111497
+    Length:          69
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111571
+    Length:          69
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111814
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111825
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          111922
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          112496
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          113002
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          113332
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          113813
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          114376
+    Length:          39
+    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(sc_low | sc_high, 0x20202020, dpct::sub_sat())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          114500
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          114511
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          114601
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          114731
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          115560
+    Length:          19
+    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(vll, vlh, dpct::sub_sat())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          115794
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          115805
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          117199
+    Length:          20
+    ReplacementText: 'bq8i->ds[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          118931
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          118942
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          118996
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          119035
+    Length:          0
+    ReplacementText: ",\n                                                                     int *tile_x_ql_q4_K,\n                                                                     sycl::half2 *tile_x_dm_q4_K,\n                                                                     int *tile_x_sc_q4_K"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          119060
+    Length:          67
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          119132
+    Length:          73
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          119210
+    Length:          69
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          119425
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          119436
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          119533
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          120169
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          120702
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          121154
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          121766
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          121777
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          121867
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          121997
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          122461
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          122472
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          123675
+    Length:          21
+    ReplacementText: 'bq8i->ds[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          125442
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          125453
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          125507
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          125546
+    Length:          0
+    ReplacementText: ",\n                                                                     int *tile_x_ql_q5_K,\n                                                                     sycl::half2 *tile_x_dm_q5_K,\n                                                                     int *tile_x_sc_q5_K"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          125571
+    Length:          67
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          125643
+    Length:          73
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          125721
+    Length:          69
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          125936
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          125947
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          126044
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          126680
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          127820
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          128183
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          128795
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          128806
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          128896
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          129026
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          129550
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          129561
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          130421
+    Length:          38
+    ReplacementText: 'bq8_1[bq8_offset + 2*i].ds[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          130572
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          130583
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          130637
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          130676
+    Length:          0
+    ReplacementText: ",\n                                                                     int *tile_x_ql,\n                                                                     sycl::half2 *tile_x_dm,\n                                                                     int *tile_x_sc"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          130701
+    Length:          62
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          130768
+    Length:          68
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          130841
+    Length:          64
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          131036
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          131047
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          131144
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          131780
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          132523
+    Length:          32
+    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(ql0 | qh0, 0x20202020, dpct::sub_sat())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          132601
+    Length:          32
+    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(ql1 | qh1, 0x20202020, dpct::sub_sat())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          133031
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          133371
+    Length:          13
+    ReplacementText: 'sycl::min(i, i_max)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          133606
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          133617
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          133707
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          133837
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          134572
+    Length:          0
+    ReplacementText: "/*\nDPCT1110:8: The total declared local variable size in device function mul_mat_q exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          134579
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          134590
+    Length:          15
+    ReplacementText: __dpct_inline__
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          134837
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          134892
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_y_qs, sycl::half2 *tile_y_ds"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          135212
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          135294
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          135354
+    Length:          47
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          135406
+    Length:          53
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          135710
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          135742
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          135875
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          136038
+    Length:          41
+    ReplacementText: 'dpct::min((unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i), ncols_y-1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          136263
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          136383
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          136546
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          136568
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          136642
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          136713
+    Length:          29
+    ReplacementText: 'sycl::min(col_y_0 + ids, ncols_y-1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          136867
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          136984
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          137237
+    Length:          20
+    ReplacementText: '(*dsi_src)[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          137292
+    Length:          0
+    ReplacementText: "            /*\n            DPCT1118:9: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n            */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          137304
+    Length:          15
+    ReplacementText: "/*\n            DPCT1065:56: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n            */\n            item_ct1.barrier()"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          137814
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          137831
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          137907
+    Length:          0
+    ReplacementText: "            /*\n            DPCT1118:10: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n            */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          137919
+    Length:          15
+    ReplacementText: "/*\n            DPCT1065:57: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n            */\n            item_ct1.barrier()"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          138058
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          138251
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          138973
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          139462
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0,\n    int *tile_y_qs, sycl::half2 *tile_y_ds"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          139503
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          140376
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          140608
+    Length:          0
+    ReplacementText: ', tile_x_qs_q4_0, tile_x_d_q4_0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          140895
+    Length:          0
+    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          140905
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          142059
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          142278
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          142600
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1,\n    sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          142641
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          143513
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          143745
+    Length:          0
+    ReplacementText: ', tile_x_qs_q4_1, tile_x_dm_q4_1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          144032
+    Length:          0
+    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          144042
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          145195
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          145684
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0,\n    int *tile_y_qs, sycl::half2 *tile_y_ds"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          145725
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          146597
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          146829
+    Length:          0
+    ReplacementText: ', tile_x_ql_q5_0, tile_x_d_q5_0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          147117
+    Length:          0
+    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          147127
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          148281
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          148766
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1,\n    sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          148807
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          149678
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          149910
+    Length:          0
+    ReplacementText: ', tile_x_ql_q5_1, tile_x_dm_q5_1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          150197
+    Length:          0
+    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          150207
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          151360
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          151849
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0,\n    int *tile_y_qs, sycl::half2 *tile_y_ds"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          151890
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          152762
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          152994
+    Length:          0
+    ReplacementText: ', tile_x_qs_q8_0, tile_x_d_q8_0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          153282
+    Length:          0
+    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          153292
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          154447
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          154932
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K,\n    sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs,\n    sycl::half2 *tile_y_ds"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          154973
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          155844
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          156076
+    Length:          0
+    ReplacementText: ', tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          156363
+    Length:          0
+    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          156373
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          157528
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          157747
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          158069
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K,\n    sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, int *tile_x_sc_q3_K,\n    int *tile_y_qs, sycl::half2 *tile_y_ds"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          158110
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          158981
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          159213
+    Length:          0
+    ReplacementText: ', tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K, tile_x_sc_q3_K'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          159500
+    Length:          0
+    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          159510
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          160663
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          160882
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          161204
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K,\n    sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs,\n    sycl::half2 *tile_y_ds"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          161245
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          162115
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          162347
+    Length:          0
+    ReplacementText: ', tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          162633
+    Length:          0
+    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          162643
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          163795
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          164280
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K,\n    sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs,\n    sycl::half2 *tile_y_ds"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          164321
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          165191
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          165423
+    Length:          0
+    ReplacementText: ', tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          165709
+    Length:          0
+    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          165719
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          166870
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          167089
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          167411
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,\n    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          167452
+    Length:          5
+    ReplacementText: 'sycl::half2'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          168323
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          168555
+    Length:          0
+    ReplacementText: ', tile_x_ql, tile_x_dm, tile_x_sc'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          168842
+    Length:          0
+    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          168852
+    Length:          13
+    ReplacementText: DPCT_COMPATIBILITY_TEMP
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          169571
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          169719
+    Length:          0
+    ReplacementText: ",\n                          const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          169743
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          169754
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          169767
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          170198
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          170269
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          170378
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          170646
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          170705
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          170830
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          170988
+    Length:          0
+    ReplacementText: ",\n                                   const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          171120
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          171131
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          171144
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          171225
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          172594
+    Length:          3
+    ReplacementText: 'v.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          172645
+    Length:          3
+    ReplacementText: 'v.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          172852
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173048
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173262
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173277
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173295
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173330
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173341
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173354
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173391
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173402
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173415
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173670
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173719
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          173938
+    Length:          19
+    ReplacementText: 'sycl::vec<sycl::half, 1>(x[ix]).convert<float, sycl::rounding_mode::automatic>()[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174346
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174405
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174466
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174741
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174756
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174774
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174813
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174824
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174837
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174876
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174887
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          174900
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          175200
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          175249
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          175513
+    Length:          19
+    ReplacementText: 'sycl::vec<sycl::half, 1>(x[ix]).convert<float, sycl::rounding_mode::automatic>()[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          175696
+    Length:          42
+    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          175755
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          175816
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          175988
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176100
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176115
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176143
+    Length:          17
+    ReplacementText: 'sycl::vec<float, 1>(*xi).convert<sycl::half, sycl::rounding_mode::automatic>()[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176172
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176246
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176265
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176282
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176297
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176370
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176670
+    Length:          0
+    ReplacementText: ",\n                                   const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176692
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176703
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          176716
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          177406
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          177689
+    Length:          21
+    ReplacementText: 'sycl::fmax(amax, sycl::fabs((float)v))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          177917
+    Length:          10
+    ReplacementText: 'sycl::round((float)x0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          177945
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          178239
+    Length:          8
+    ReplacementText: 'sycl::fabs((float)v)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          178270
+    Length:          8
+    ReplacementText: 'sycl::fabs((float)v)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          178569
+    Length:          28
+    ReplacementText: 'dpct::min(15, (int8_t)(x0 + 8.5f))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          178627
+    Length:          28
+    ReplacementText: 'dpct::min(15, (int8_t)(x1 + 8.5f))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          178735
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          179187
+    Length:          10
+    ReplacementText: 'dsti->dm.x()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          179207
+    Length:          10
+    ReplacementText: 'dsti->dm.y()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          179404
+    Length:          28
+    ReplacementText: 'dpct::min(15, (int8_t)(x0 + 0.5f))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          179462
+    Length:          28
+    ReplacementText: 'dpct::min(15, (int8_t)(x1 + 0.5f))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          179611
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          179905
+    Length:          0
+    ReplacementText: ",\n                                 const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          179928
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          179939
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          179952
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          180465
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          180585
+    Length:          23
+    ReplacementText: 'sycl::max(0.001f, high - low)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          180628
+    Length:          23
+    ReplacementText: 'sycl::min(1.0f, sycl::max(0.0f, y))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          180878
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          181499
+    Length:          23
+    ReplacementText: 'sycl::log(1.0f / freq_scale)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          181547
+    Length:          11
+    ReplacementText: 'sycl::cos(theta)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          181586
+    Length:          11
+    ReplacementText: 'sycl::sin(theta)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          181700
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          181899
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          181926
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          181937
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          181950
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182032
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182043
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182056
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182214
+    Length:          34
+    ReplacementText: 'dpct::pow(freq_base, -float(col)/ncols)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182588
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182823
+    Length:          0
+    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182850
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182861
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182874
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182956
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182967
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          182980
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          183427
+    Length:          27
+    ReplacementText: 'dpct::pow(theta_scale, col/2.0f)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          183784
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          183947
+    Length:          0
+    ReplacementText: ', const sycl::nd_item<3> &item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          183971
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          183982
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          183995
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          184119
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          184130
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          184143
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          184263
+    Length:          32
+    ReplacementText: 'dpct::pow(freq_base, -2.0f*col/ncols)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          184406
+    Length:          17
+    ReplacementText: 'sycl::min(p, n_ctx - 2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          184480
+    Length:          11
+    ReplacementText: 'sycl::sin((float)theta)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          184521
+    Length:          11
+    ReplacementText: 'sycl::cos((float)theta)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          184759
+    Length:          21
+    ReplacementText: 'sycl::max(p - n_ctx - 2, 0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          184833
+    Length:          17
+    ReplacementText: 'sycl::sin((float)block_theta)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          184886
+    Length:          17
+    ReplacementText: 'sycl::cos((float)block_theta)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185151
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185335
+    Length:          0
+    ReplacementText: ",\n                                 const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185359
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185370
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185383
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185464
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185475
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185488
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185631
+    Length:          15
+    ReplacementText: 'dpct::pow(m0, k + 1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185675
+    Length:          42
+    ReplacementText: 'dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185767
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185843
+    Length:          0
+    ReplacementText: ",\n                           const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185867
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185899
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          185973
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          186058
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          186150
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          186268
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          186345
+    Length:          0
+    ReplacementText: ",\n                              const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          186383
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          186410
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          186624
+    Length:          15
+    ReplacementText: "/*\n    DPCT1065:58: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n    */\n    item_ct1.barrier()"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          187320
+    Length:          0
+    ReplacementText: "            /*\n            DPCT1118:11: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n            */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          187332
+    Length:          15
+    ReplacementText: "/*\n            DPCT1065:59: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n            */\n            item_ct1.barrier()"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          187375
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          187500
+    Length:          0
+    ReplacementText: ",\n                              const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          187524
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          187535
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          187548
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          187581
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          187592
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          187605
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          187993
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          188122
+    Length:          0
+    ReplacementText: ",\n                         const sycl::nd_item<3> &item_ct1, float *buf"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          188147
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          188181
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          188305
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          188342
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          188391
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          188421
+    Length:          57
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          188670
+    Length:          46
+    ReplacementText: 'sycl::max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          188801
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          188914
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1118:12: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          188922
+    Length:          15
+    ReplacementText: "/*\n        DPCT1065:60: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          189014
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1118:13: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          189022
+    Length:          15
+    ReplacementText: "/*\n        DPCT1065:61: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          189113
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          189311
+    Length:          50
+    ReplacementText: 'sycl::native::exp((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          189483
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          189590
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1118:14: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          189598
+    Length:          15
+    ReplacementText: "/*\n        DPCT1065:62: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          189686
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1118:15: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          189694
+    Length:          15
+    ReplacementText: "/*\n        DPCT1065:63: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          189773
+    Length:          0
+    ReplacementText: ', item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          189962
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190048
+    Length:          0
+    ReplacementText: ",\n                      const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190070
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190081
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190094
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190186
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190287
+    Length:          0
+    ReplacementText: ",\n                      const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190309
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190320
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190333
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190458
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190515
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190671
+    Length:          0
+    ReplacementText: ",\n        const sycl::nd_item<3> &item_ct1"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190693
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190707
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          190720
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          191022
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          191095
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          191134
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          191257
+    Length:          18
+    ReplacementText: 'sycl::vec<float, 1>(0.0f).convert<sycl::half, sycl::rounding_mode::automatic>()[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          191325
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(0)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          191378
+    Length:          44
+    ReplacementText: 'sycl::vec<float, 1>(x[offset_src + iih * IW + iiw]).convert<sycl::half, sycl::rounding_mode::automatic>()[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          191670
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          191738
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          191754
+    Length:          30
+    ReplacementText: 1, 1, CUDA_GET_ROWS_BLOCK_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          191897
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          191913
+    Length:          28
+    ReplacementText: 'ne11*ne12, ne10, block_num_x'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          192443
+    Length:          294
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) {\n        k_get_rows<qk, qr, dq>(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          192737
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          192985
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          193053
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          193069
+    Length:          30
+    ReplacementText: 1, 1, CUDA_GET_ROWS_BLOCK_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          193206
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          193222
+    Length:          28
+    ReplacementText: 'ne11*ne12, ne10, block_num_x'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          193719
+    Length:          288
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) {\n          k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          194007
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          194368
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          196599
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          196614
+    Length:          0
+    ReplacementText: '(1, 1, 1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          196628
+    Length:          12
+    ReplacementText: 'block_dims[2]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          196697
+    Length:          12
+    ReplacementText: 'block_dims[1]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          196712
+    Length:          54
+    ReplacementText: 'std::min<unsigned int>(ne1, block_size / (unsigned int)block_dims[2])'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          196780
+    Length:          12
+    ReplacementText: 'block_dims[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          196795
+    Length:          88
+    ReplacementText: 'std::min(std::min<unsigned int>(ne2*ne3, block_size / (unsigned int)block_dims[2] / (unsigned int)block_dims[1]), 64U)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          196898
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          196914
+    Length:          188
+    ReplacementText: '(ne2*ne3 + block_dims[0] - 1) / block_dims[0], (ne1 + block_dims[1] - 1) / block_dims[1], (hne0 + block_dims[2] - 1) / block_dims[2]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          197122
+    Length:          12
+    ReplacementText: 'block_nums[0]'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          197342
+    Length:          284
+    ReplacementText: "{\n                  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n                  stream->parallel_for(\n                    sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size), sycl::range<3>(1, 1, block_size)), \n                    [=](sycl::nd_item<3> item_ct1) {\n                      k_bin_bcast_unravel<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, s13, item_ct1);\n                    });\n                }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          197626
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          197649
+    Length:          0
+    ReplacementText: "                /*\n                DPCT1049:16: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n                */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          197665
+    Length:          277
+    ReplacementText: "  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n                  stream->parallel_for(\n                    sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                    [=](sycl::nd_item<3> item_ct1) {\n                      k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, s13, item_ct1);\n                    });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          197942
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          198176
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          198286
+    Length:          114
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          198400
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          198474
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          198583
+    Length:          68
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        gelu_f32(x, dst, k, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          198651
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          198725
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          198834
+    Length:          68
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        silu_f32(x, dst, k, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          198902
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          198982
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          199091
+    Length:          74
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        gelu_quick_f32(x, dst, k, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          199165
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          199239
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          199348
+    Length:          68
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        tanh_f32(x, dst, k, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          199416
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          199490
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          199599
+    Length:          68
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        relu_f32(x, dst, k, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          199667
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          199775
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          199884
+    Length:          90
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        leaky_relu_f32(x, dst, k, negative_slope, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          199974
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200047
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200154
+    Length:          66
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        sqr_f32(x, dst, k, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200220
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200332
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200434
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200450
+    Length:          15
+    ReplacementText: 1, 1, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200476
+    Length:          73
+    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200549
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200578
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200594
+    Length:          10
+    ReplacementText: 1, 1, 1024
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200607
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200615
+    Length:          68
+    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                norm_f32<1024>(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200683
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200823
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200925
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200941
+    Length:          15
+    ReplacementText: 1, 1, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          200967
+    Length:          102
+    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            const float eps_ct4 = eps;\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                group_norm_f32<WARP_SIZE>(x, dst, group_size, ne_elements, eps_ct4, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201069
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201098
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201114
+    Length:          10
+    ReplacementText: 1, 1, 1024
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201127
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201135
+    Length:          97
+    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            const float eps_ct4 = eps;\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                group_norm_f32<1024>(x, dst, group_size, ne_elements, eps_ct4, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201232
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201361
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201470
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201483
+    Length:          20
+    ReplacementText: ne2, ne1, num_blocks
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201510
+    Length:          80
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        concat_f32(x, y, dst, ne0, ne02, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201590
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201726
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201874
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201887
+    Length:          39
+    ReplacementText: 'ne02, (ne01 * scale_factor), num_blocks'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          201933
+    Length:          101
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202034
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202195
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202298
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202311
+    Length:          20
+    ReplacementText: ne2, ne1, num_blocks
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202338
+    Length:          83
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202421
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202537
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202639
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202655
+    Length:          15
+    ReplacementText: 1, 1, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202681
+    Length:          77
+    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                rms_norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202758
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202787
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202803
+    Length:          10
+    ReplacementText: 1, 1, 1024
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202816
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202824
+    Length:          72
+    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                rms_norm_f32<1024>(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          202896
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203019
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203151
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203167
+    Length:          18
+    ReplacementText: 1, ky, block_num_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203198
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203214
+    Length:          32
+    ReplacementText: 1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203253
+    Length:          74
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(num_blocks * block_size, block_size), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          quantize_q8_1(x, vy, kx, kx_padded, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203327
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203514
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203635
+    Length:          108
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203743
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203851
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203923
+    Length:          51
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block_q2_K(vx, y, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          203974
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          204152
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          204224
+    Length:          51
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block_q3_K(vx, y, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          204275
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          204453
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          204509
+    Length:          51
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block_q4_K(vx, y, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          204560
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          204668
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          204740
+    Length:          51
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block_q5_K(vx, y, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          204791
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          204969
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          205041
+    Length:          51
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block_q6_K(vx, y, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          205092
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          207504
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          207771
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          207787
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          207817
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          207833
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          207869
+    Length:          127
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          207996
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208128
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208286
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208302
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208332
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208348
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208384
+    Length:          127
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208511
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208643
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208801
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208817
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208847
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208863
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          208899
+    Length:          127
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209026
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209158
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209316
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209332
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209362
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209378
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209414
+    Length:          127
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209541
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209673
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209831
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209847
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209877
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209893
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          209929
+    Length:          127
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210056
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210187
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210397
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210413
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210443
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210459
+    Length:          9
+    ReplacementText: 1, ny, 32
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210475
+    Length:          92
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210567
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210698
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210865
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210881
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210911
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210927
+    Length:          9
+    ReplacementText: 1, ny, 32
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          210943
+    Length:          92
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211035
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211166
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211333
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211349
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211379
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211395
+    Length:          9
+    ReplacementText: 1, ny, 32
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211411
+    Length:          92
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211503
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211634
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211703
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211719
+    Length:          8
+    ReplacementText: 1, 1, 32
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211734
+    Length:          80
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211814
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          211945
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212112
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212128
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212158
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212174
+    Length:          9
+    ReplacementText: 1, ny, 32
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212190
+    Length:          92
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212282
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212410
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212568
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212584
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212614
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212630
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212666
+    Length:          115
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212781
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          212906
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213053
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213069
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213099
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213115
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213151
+    Length:          153
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213304
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213429
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213576
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213592
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213622
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213638
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213674
+    Length:          153
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213827
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          213952
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214099
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214115
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214145
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214161
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214197
+    Length:          153
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214350
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214475
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214622
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214638
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214668
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214684
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214720
+    Length:          153
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214873
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          214998
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215145
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215161
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215191
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215207
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215243
+    Length:          153
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215396
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215521
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215667
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215683
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215713
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215729
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215765
+    Length:          152
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          215917
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216042
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216188
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216204
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216234
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216250
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216286
+    Length:          152
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216438
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216563
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216709
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216725
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216755
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216771
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216807
+    Length:          152
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          216959
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217084
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217230
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217246
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217276
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217292
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217328
+    Length:          152
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217480
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217605
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217751
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217767
+    Length:          17
+    ReplacementText: 1, 1, block_num_y
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217797
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217813
+    Length:          29
+    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          217849
+    Length:          152
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          218001
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          218199
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          218220
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          218250
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219170
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219186
+    Length:          27
+    ReplacementText: 1, block_num_y, block_num_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219226
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219242
+    Length:          20
+    ReplacementText: 1, nwarps, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219337
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:20: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219345
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_0_acc_ct1.get_pointer(), tile_x_d_q4_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219481
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219534
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:21: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219542
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_0_acc_ct1.get_pointer(), tile_x_d_q4_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219678
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219687
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219882
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219903
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          219933
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          220853
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          220869
+    Length:          27
+    ReplacementText: 1, block_num_y, block_num_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          220909
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          220925
+    Length:          20
+    ReplacementText: 1, nwarps, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          221020
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:22: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          221028
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) +     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_1_acc_ct1.get_pointer(), tile_x_dm_q4_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          221164
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          221217
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:23: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          221225
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) +     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_1_acc_ct1.get_pointer(), tile_x_dm_q4_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          221361
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          221370
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          221565
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          221586
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          221616
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          222536
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          222552
+    Length:          27
+    ReplacementText: 1, block_num_y, block_num_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          222592
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          222608
+    Length:          20
+    ReplacementText: 1, nwarps, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          222703
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:24: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          222711
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_0_acc_ct1.get_pointer(), tile_x_d_q5_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          222847
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          222900
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:25: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          222908
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_0_acc_ct1.get_pointer(), tile_x_d_q5_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          223044
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          223053
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          223248
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          223269
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          223299
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224219
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224235
+    Length:          27
+    ReplacementText: 1, block_num_y, block_num_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224275
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224291
+    Length:          20
+    ReplacementText: 1, nwarps, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224386
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:26: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224394
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_1_acc_ct1.get_pointer(), tile_x_dm_q5_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224530
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224583
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:27: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224591
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_1_acc_ct1.get_pointer(), tile_x_dm_q5_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224727
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224736
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224931
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224952
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          224982
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          225902
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          225918
+    Length:          27
+    ReplacementText: 1, block_num_y, block_num_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          225958
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          225974
+    Length:          20
+    ReplacementText: 1, nwarps, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          226069
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:28: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          226077
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q8_0_acc_ct1.get_pointer(), tile_x_d_q8_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          226213
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          226266
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:29: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          226274
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q8_0_acc_ct1.get_pointer(), tile_x_d_q8_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          226410
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          226419
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          226614
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          226635
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          226665
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          227585
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          227601
+    Length:          27
+    ReplacementText: 1, block_num_y, block_num_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          227641
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          227657
+    Length:          20
+    ReplacementText: 1, nwarps, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          227752
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:30: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          227760
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4)     + mmq_y/4), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q2_K_acc_ct1.get_pointer(), tile_x_dm_q2_K_acc_ct1.get_pointer(), tile_x_sc_q2_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          227896
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          227949
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:31: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          227957
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4)     + mmq_y/4), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q2_K_acc_ct1.get_pointer(), tile_x_dm_q2_K_acc_ct1.get_pointer(), tile_x_sc_q2_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          228093
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          228102
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          228297
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          228318
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          228365
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          229285
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          229301
+    Length:          27
+    ReplacementText: 1, block_num_y, block_num_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          229341
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          229357
+    Length:          20
+    ReplacementText: 1, nwarps, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          229452
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          229460
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/2)     + mmq_y/2), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4)     + mmq_y/4), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q3_K_acc_ct1.get_pointer(), tile_x_dm_q3_K_acc_ct1.get_pointer(), tile_x_qh_q3_K_acc_ct1.get_pointer(), tile_x_sc_q3_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          229596
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          229649
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          229657
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/2)     + mmq_y/2), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4)     + mmq_y/4), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q3_K_acc_ct1.get_pointer(), tile_x_dm_q3_K_acc_ct1.get_pointer(), tile_x_qh_q3_K_acc_ct1.get_pointer(), tile_x_sc_q3_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          229793
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          229809
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          230004
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          230025
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          230055
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          230975
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          230991
+    Length:          27
+    ReplacementText: 1, block_num_y, block_num_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231031
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231047
+    Length:          20
+    ReplacementText: 1, nwarps, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231142
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231150
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q4_K_acc_ct1.get_pointer(), tile_x_dm_q4_K_acc_ct1.get_pointer(), tile_x_sc_q4_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231286
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231339
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231347
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q4_K_acc_ct1.get_pointer(), tile_x_dm_q4_K_acc_ct1.get_pointer(), tile_x_sc_q4_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231483
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231492
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231687
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231708
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          231738
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          232658
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          232674
+    Length:          27
+    ReplacementText: 1, block_num_y, block_num_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          232714
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          232730
+    Length:          20
+    ReplacementText: 1, nwarps, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          232825
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          232833
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_K_acc_ct1.get_pointer(), tile_x_dm_q5_K_acc_ct1.get_pointer(), tile_x_sc_q5_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          232969
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          233022
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          233030
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_K_acc_ct1.get_pointer(), tile_x_dm_q5_K_acc_ct1.get_pointer(), tile_x_sc_q5_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          233166
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          233175
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          233370
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          233391
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          233421
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          234341
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          234357
+    Length:          27
+    ReplacementText: 1, block_num_y, block_num_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          234397
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          234413
+    Length:          20
+    ReplacementText: 1, nwarps, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          234508
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          234516
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_acc_ct1.get_pointer(), tile_x_dm_acc_ct1.get_pointer(), tile_x_sc_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          234652
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          234705
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          234713
+    Length:          136
+    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_acc_ct1.get_pointer(), tile_x_dm_acc_ct1.get_pointer(), tile_x_sc_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          234849
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          234858
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235043
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235077
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235093
+    Length:          23
+    ReplacementText: nchannels_y, nrows_x, 1
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235129
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235145
+    Length:          15
+    ReplacementText: 1, 1, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235167
+    Length:          115
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235282
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235524
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235558
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235574
+    Length:          23
+    ReplacementText: nchannels_y, nrows_x, 1
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235610
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235626
+    Length:          15
+    ReplacementText: 1, 1, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235648
+    Length:          157
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          235805
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          236061
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          236170
+    Length:          157
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), \n        [=](sycl::nd_item<3> item_ct1) {\n          cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          236327
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          236583
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          236692
+    Length:          157
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), \n        [=](sycl::nd_item<3> item_ct1) {\n          cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          236849
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          237106
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          237207
+    Length:          148
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), \n      [=](sycl::nd_item<3> item_ct1) {\n        cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          237355
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          237612
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          237713
+    Length:          148
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), \n      [=](sycl::nd_item<3> item_ct1) {\n        cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          237861
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          238118
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          238219
+    Length:          148
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), \n      [=](sycl::nd_item<3> item_ct1) {\n        cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          238367
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          238623
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          238732
+    Length:          157
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), \n        [=](sycl::nd_item<3> item_ct1) {\n          cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          238889
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          238983
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          239094
+    Length:          77
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        scale_f32(x, dst, scale, k, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          239171
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          239280
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          239391
+    Length:          80
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        clamp_f32(x, dst, min, max, k, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          239471
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          239709
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          239776
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          239925
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          239941
+    Length:          22
+    ReplacementText: 1, num_blocks_x, nrows
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          239992
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          240000
+    Length:          168
+    ReplacementText: "  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->parallel_for(\n            sycl::nd_range<3>(block_nums * block_dims, block_dims), \n            [=](sycl::nd_item<3> item_ct1) {\n              rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, item_ct1);\n            });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          240168
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          240183
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          240191
+    Length:          167
+    ReplacementText: "  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->parallel_for(\n            sycl::nd_range<3>(block_nums * block_dims, block_dims), \n            [=](sycl::nd_item<3> item_ct1) {\n              rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, item_ct1);\n            });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          240358
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          240619
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          240686
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          240835
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          240851
+    Length:          22
+    ReplacementText: 1, num_blocks_x, nrows
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241009
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241017
+    Length:          206
+    ReplacementText: "  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->parallel_for(\n            sycl::nd_range<3>(block_nums * block_dims, block_dims), \n            [=](sycl::nd_item<3> item_ct1) {\n              rope_neox<T, false>(x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, theta_scale, inv_ndims, item_ct1);\n            });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241223
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241238
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241246
+    Length:          205
+    ReplacementText: "  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->parallel_for(\n            sycl::nd_range<3>(block_nums * block_dims, block_dims), \n            [=](sycl::nd_item<3> item_ct1) {\n              rope_neox<T, true>(x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, theta_scale, inv_ndims, item_ct1);\n            });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241451
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241638
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241705
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241721
+    Length:          28
+    ReplacementText: '1, 1, CUDA_ROPE_BLOCK_SIZE/4'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241850
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241866
+    Length:          22
+    ReplacementText: 1, nrows, num_blocks_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          241895
+    Length:          115
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) {\n        rope_glm_f32(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242010
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242240
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242273
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242289
+    Length:          27
+    ReplacementText: 1, 1, CUDA_ALIBI_BLOCK_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242421
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242437
+    Length:          22
+    ReplacementText: 1, nrows, num_blocks_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242466
+    Length:          99
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) {\n        alibi_f32(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242565
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242664
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242697
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242713
+    Length:          15
+    ReplacementText: 1, 1, WARP_SIZE
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242741
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242775
+    Length:          68
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        k_sum_rows_f32(x, dst, ncols, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242843
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          242966
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243097
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243113
+    Length:          11
+    ReplacementText: 1, 1, ncols
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243137
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243201
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243209
+    Length:          86
+    ReplacementText: "stream->parallel_for(\n          sycl::nd_range<3>(block_nums * block_dims, block_dims), \n          [=](sycl::nd_item<3> item_ct1) {\n            k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);\n          });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243295
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243339
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243347
+    Length:          87
+    ReplacementText: "stream->parallel_for(\n          sycl::nd_range<3>(block_nums * block_dims, block_dims), \n          [=](sycl::nd_item<3> item_ct1) {\n            k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);\n          });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243434
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243635
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243668
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243839
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243855
+    Length:          23
+    ReplacementText: 1, block_num_x, nrows_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243885
+    Length:          99
+    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) {\n        diag_mask_inf_f32(x, dst, ncols_x, rows_per_channel, n_past, item_ct1);\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          243984
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244142
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244270
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244286
+    Length:          13
+    ReplacementText: 1, 1, nth
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244312
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244328
+    Length:          13
+    ReplacementText: 1, 1, nrows_x
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244344
+    Length:          0
+    ReplacementText: "    /*\n    DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n    */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244348
+    Length:          87
+    ReplacementText: "stream->submit(\n      [&](sycl::handler &cgh) {\n        /*\n        DPCT1101:96: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was replaced with a value. Modify the code to use the original expression, provided in comments, if it is correct.\n        */\n        sycl::local_accessor<float, 1> buf_acc_ct1(sycl::range<1>(32/*CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);\n\n        cgh.parallel_for(\n          sycl::nd_range<3>(block_nums * block_dims, block_dims), \n          [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n            soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1, buf_acc_ct1.get_pointer());\n          });\n      });"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244435
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244488
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244628
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244805
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244821
+    Length:          18
+    ReplacementText: IC, OH, num_blocks
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          244846
+    Length:          166
+    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE)), \n        [=](sycl::nd_item<3> item_ct1) {\n          im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1, item_ct1);\n        });\n    }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          245012
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          245872
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          245946
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          247185
+    Length:          43
+    ReplacementText: 'DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(look_ahead_size, dpct::get_in_order_queue()))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          247595
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          247658
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          247732
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          248074
+    Length:          13
+    ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          248087
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          248125
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          248186
+    Length:          0
+    ReplacementText: "/*\nDPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.\n*/\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          248286
+    Length:          11
+    ReplacementText: 'dpct::device_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          248548
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          248622
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          249332
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1082:65: Migration of CUmemAllocationProp type is not supported.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          249511
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1082:66: Migration of CUmemGenericAllocationHandle type is not supported.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          249556
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1007:69: Migration of cuMemCreate is not supported.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          249729
+    Length:          0
+    ReplacementText: "            /*\n            DPCT1007:70: Migration of cuMemAddressReserve is not supported.\n            */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          249877
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1007:71: Migration of cuMemMap is not supported.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          250001
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1082:72: Migration of CUmemAccessDesc type is not supported.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          250190
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1007:73: Migration of cuMemSetAccess is not supported.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          250981
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          251044
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          251118
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          251451
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          251524
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          251553
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          251749
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          251808
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          251837
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          251999
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          253084
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          253419
+    Length:          35
+    ReplacementText: 'DPCT_CHECK_ERROR(g_device_count = dpct::dev_mgr::instance().device_count())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          253458
+    Length:          11
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          254270
+    Length:          8
+    ReplacementText: int
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          254308
+    Length:          24
+    ReplacementText: 'DPCT_CHECK_ERROR(device = id)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          254335
+    Length:          0
+    ReplacementText: "            /*\n            DPCT1028:74: The cuDeviceGetAttribute was not migrated because parameter CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED is unsupported.\n            */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          254488
+    Length:          0
+    ReplacementText: "                /*\n                DPCT1082:75: Migration of CUmemAllocationProp type is not supported.\n                */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          254723
+    Length:          0
+    ReplacementText: "                /*\n                DPCT1007:76: Migration of cuMemGetAllocationGranularity is not supported.\n                */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          254977
+    Length:          14
+    ReplacementText: 'dpct::device_info'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255021
+    Length:          23
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_device_info'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255045
+    Length:          5
+    ReplacementText: prop
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255052
+    Length:          2
+    ReplacementText: 'dpct::dev_mgr::instance().get_device(id)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255055
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255058
+    Length:          0
+    ReplacementText: "            /*\n            DPCT1005:77: The SYCL device version is different from CUDA Compute Compatibility. You may need to rewrite this code.\n            */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255151
+    Length:          4
+    ReplacementText: 'get_name()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255162
+    Length:          5
+    ReplacementText: 'get_major_version()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255174
+    Length:          5
+    ReplacementText: 'get_minor_version()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255286
+    Length:          14
+    ReplacementText: 'get_global_mem_size()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255454
+    Length:          0
+    ReplacementText: "            /*\n            DPCT1005:78: The SYCL device version is different from CUDA Compute Compatibility. You may need to rewrite this code.\n            */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255498
+    Length:          5
+    ReplacementText: 'get_major_version()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255514
+    Length:          5
+    ReplacementText: 'get_minor_version()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255906
+    Length:          0
+    ReplacementText: "                /*\n                DPCT1025:79: The SYCL queue is created ignoring the flag and priority options.\n                */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          255933
+    Length:          72
+    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[id][is] = dpct::get_current_device().create_queue())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          256084
+    Length:          35
+    ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[id] = &dpct::get_in_order_queue())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          256122
+    Length:          0
+    ReplacementText: "            /*\n            DPCT1027:80: The call to cublasSetMathMode was replaced with 0 because this functionality is redundant in SYCL.\n            */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          256147
+    Length:          67
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          256401
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257037
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257152
+    Length:          11
+    ReplacementText: 'dpct::err0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257170
+    Length:          36
+    ReplacementText: 'DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257208
+    Length:          0
+    ReplacementText: "    /*\n    DPCT1000:82: Error handling if-stmt was detected but could not be rewritten.\n    */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257223
+    Length:          11
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257265
+    Length:          28
+    ReplacementText: "        /*\n        DPCT1026:83: The call to cudaGetLastError was removed because this functionality is redundant in SYCL.\n        */\n        /*\n        DPCT1001:81: The statement could not be removed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257379
+    Length:          0
+    ReplacementText: "            /*\n            DPCT1009:84: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code.\n            */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257411
+    Length:          23
+    ReplacementText: '"cudaGetErrorString is not supported"/*cudaGetErrorString(err)*/'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257485
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257524
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257541
+    Length:          17
+    ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257558
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257562
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257571
+    Length:          11
+    ReplacementText: 'dpct::err0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257713
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257734
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257741
+    Length:          14
+    ReplacementText: 'dpct::memcpy_direction'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          257841
+    Length:          22
+    ReplacementText: 'dpct::host_to_device'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          258114
+    Length:          24
+    ReplacementText: 'dpct::device_to_device'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          258253
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          258879
+    Length:          15
+    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          258918
+    Length:          6
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          258924
+    Length:          8
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          258933
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          258978
+    Length:          17
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::async_dpct_memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          259050
+    Length:          0
+    ReplacementText: '*'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          259057
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          259323
+    Length:          11
+    ReplacementText: 'dpct::err0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          259339
+    Length:          17
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::async_dpct_memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          259395
+    Length:          0
+    ReplacementText: '*'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          259402
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          259404
+    Length:          0
+    ReplacementText: "            /*\n            DPCT1001:85: The statement could not be removed.\n            */\n            /*\n            DPCT1000:86: Error handling if-stmt was detected but could not be rewritten.\n            */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          259425
+    Length:          11
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          259473
+    Length:          11
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          259493
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          259674
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          260139
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          261515
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          261845
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          261872
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          262015
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          262476
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          262832
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          263151
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          263965
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          264284
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          264604
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          265034
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          265470
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          265906
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          266336
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          266772
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          267310
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          267739
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          268320
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          268937
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          269563
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          270147
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          270721
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          271448
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          271471
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          271702
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          274068
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          276458
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          278619
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          281822
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          281845
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          282160
+    Length:          18
+    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          282780
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          283141
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          283196
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          283260
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          283624
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          283679
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          283743
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          283794
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          283831
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          283875
+    Length:          45
+    ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          283957
+    Length:          396
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::gemm(*g_cublas_handles[id], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00, src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, dst_f16.get(), dpct::library_data_t::real_half, ldc, dpct::library_data_t::real_half))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          285106
+    Length:          45
+    ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          285188
+    Length:          244
+    ReplacementText: 'DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(*g_cublas_handles[id], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00, src1_ddf_i, ne10, dpct::get_value(&beta, *g_cublas_handles[id]), dst_dd_i, ldc))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          285515
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          285695
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          287917
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          287934
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          288500
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          288517
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          288959
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          290051
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          291125
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          291444
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          291957
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          292569
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          293193
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          293941
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          294213
+    Length:          0
+    ReplacementText: "    /*\n    DPCT1010:87: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n    */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          294228
+    Length:          18
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          294485
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          294834
+    Length:          0
+    ReplacementText: "    /*\n    DPCT1010:88: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n    */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          294849
+    Length:          18
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          295064
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          296196
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          297085
+    Length:          0
+    ReplacementText: "    /*\n    DPCT1010:89: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n    */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          297100
+    Length:          18
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          297204
+    Length:          15
+    ReplacementText: 'DPCT_CHECK_ERROR(main_stream->memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          297256
+    Length:          24
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          297280
+    Length:          13
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          297294
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          297367
+    Length:          23
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          297400
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          298826
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          302402
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          303369
+    Length:          0
+    ReplacementText: "                /*\n                DPCT1010:90: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n                */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          303396
+    Length:          18
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          304041
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1024:91: The original code returned the error code that was further consumed by the program logic. This original code was replaced with 0. You may need to rewrite the program logic consuming the error code.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          304060
+    Length:          86
+    ReplacementText: 'DPCT_CHECK_ERROR(*src0_extra->events[g_main_device][0] = g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          305030
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          305215
+    Length:          68
+    ReplacementText: 'DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier({*src0_extra->events[g_main_device][0]}))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          306734
+    Length:          15
+    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          306828
+    Length:          78
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          306906
+    Length:          8
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          306915
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          307176
+    Length:          15
+    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          307252
+    Length:          78
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          307330
+    Length:          8
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          307339
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          307967
+    Length:          0
+    ReplacementText: "                    /*\n                    DPCT1010:92: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n                    */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          307998
+    Length:          18
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          308505
+    Length:          0
+    ReplacementText: "                /*\n                DPCT1010:93: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n                */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          308532
+    Length:          18
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          308720
+    Length:          14
+    ReplacementText: 'dpct::memcpy_direction'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          308884
+    Length:          22
+    ReplacementText: 'dpct::device_to_host'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          309086
+    Length:          24
+    ReplacementText: 'dpct::device_to_device'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          310027
+    Length:          17
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::async_dpct_memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          310203
+    Length:          0
+    ReplacementText: '*'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          310210
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          310501
+    Length:          15
+    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          310566
+    Length:          6
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          310572
+    Length:          8
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          310581
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          310777
+    Length:          0
+    ReplacementText: "                    /*\n                    DPCT1024:94: The original code returned the error code that was further consumed by the program logic. This original code was replaced with 0. You may need to rewrite the program logic consuming the error code.\n                    */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          310808
+    Length:          51
+    ReplacementText: 'DPCT_CHECK_ERROR(*src0_extra->events[id][is] = stream->ext_oneapi_submit_barrier())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          312109
+    Length:          83
+    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier({*src0_extra->events[id][is]}))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          312346
+    Length:          23
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          312379
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          316249
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          316870
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          317481
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          317590
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          318161
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          318716
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          318774
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          318919
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          318928
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          318982
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          319008
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          319288
+    Length:          0
+    ReplacementText: ', const sycl::nd_item<3> &item_ct1'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          319310
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          319323
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          319336
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(2)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          319367
+    Length:          10
+    ReplacementText: 'item_ct1.get_group(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          319380
+    Length:          10
+    ReplacementText: 'item_ct1.get_local_range(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          319393
+    Length:          11
+    ReplacementText: 'item_ct1.get_local_id(1)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          319926
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          320948
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          321026
+    Length:          61
+    ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          321237
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          321259
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          321748
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          321858
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          321896
+    Length:          19
+    ReplacementText: 'dpct::library_data_t'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          321958
+    Length:          14
+    ReplacementText: 'dpct::library_data_t'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          321996
+    Length:          10
+    ReplacementText: 'dpct::library_data_t::real_half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          322099
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          322133
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          322435
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          322481
+    Length:          4
+    ReplacementText: 'sycl::half'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          322608
+    Length:          10
+    ReplacementText: 'dpct::library_data_t::real_float'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          324031
+    Length:          613
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::gemm_batch(*g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const char *) src0_as_f16, dpct::library_data_t::real_half, nb01/sizeof(sycl::half), src0->nb[2]/sizeof(sycl::half), (const char *) src1_as_f16.get(), dpct::library_data_t::real_half, nb11/sizeof(float), src1->nb[2]/sizeof(float), beta, (      char *)       dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), ne12*ne13, cu_compute_type))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          324853
+    Length:          4
+    ReplacementText: 'sycl::range<3>'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          324869
+    Length:          10
+    ReplacementText: 1, ne12, ne13
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          324882
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          324890
+    Length:          319
+    ReplacementText: "{\n          dpct::has_capability_or_fail(main_stream->get_device(), {sycl::aspect::fp16});\n\n          main_stream->submit(\n            [&](sycl::handler &cgh) {\n              const sycl::half * src1_as_f16_get_ct1 = src1_as_f16.get();\n              const void ** ptrs_src_get_ct3 = ptrs_src.get();\n              void ** ptrs_dst_get_ct4 = ptrs_dst.get();\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  k_compute_batched_ptrs(src0_as_f16, src1_as_f16_get_ct1, dst_t, ptrs_src_get_ct3, ptrs_dst_get_ct4, ne12, ne13, ne23, nb02, nb03, nb12, nb13, nbd2, nbd3, r2, r3, item_ct1);\n                });\n            });\n        }"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: true
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          325209
+    Length:          1
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          325211
+    Length:          0
+    ReplacementText: "        /*\n        DPCT1010:95: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n        */\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          325230
+    Length:          18
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          325282
+    Length:          499
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::gemm_batch(*g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const void **) (ptrs_src.get() + 0*ne23), dpct::library_data_t::real_half, nb01/sizeof(sycl::half), (const void **) (ptrs_src.get() + 1*ne23), dpct::library_data_t::real_half, nb11/sizeof(float), beta, (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01, ne23, cu_compute_type))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          326000
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          336638
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          337017
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          337254
+    Length:          15
+    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          337312
+    Length:          24
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          337336
+    Length:          8
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          337345
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          337367
+    Length:          29
+    ReplacementText: 'DPCT_CHECK_ERROR(stream->wait())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          339747
+    Length:          14
+    ReplacementText: 'dpct::memcpy_direction'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          339822
+    Length:          22
+    ReplacementText: 'dpct::host_to_device'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          339847
+    Length:          24
+    ReplacementText: 'dpct::device_to_device'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          339887
+    Length:          14
+    ReplacementText: 'dpct::memcpy_direction'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          339962
+    Length:          22
+    ReplacementText: 'dpct::device_to_host'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          339987
+    Length:          24
+    ReplacementText: 'dpct::device_to_device'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          340538
+    Length:          15
+    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          340668
+    Length:          11
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          340679
+    Length:          8
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          340688
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          341614
+    Length:          15
+    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          341739
+    Length:          10
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          341749
+    Length:          8
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          341758
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          341888
+    Length:          29
+    ReplacementText: 'DPCT_CHECK_ERROR(stream->wait())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          341927
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          342360
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          343164
+    Length:          12
+    ReplacementText: 'dpct::queue_ptr'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          344994
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          347078
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          348753
+    Length:          22
+    ReplacementText: 'DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(size, dpct::get_in_order_queue()))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          348950
+    Length:          10
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memset'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          349006
+    Length:          0
+    ReplacementText: '.wait())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          349039
+    Length:          10
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          349078
+    Length:          24
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          349103
+    Length:          0
+    ReplacementText: '.wait())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          349281
+    Length:          72
+    ReplacementText: 'DPCT_CHECK_ERROR(extra->events[id][is] = new sycl::event())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          349415
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          349471
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          349874
+    Length:          32
+    ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(extra->data_device[id], dpct::get_in_order_queue())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          349906
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          350108
+    Length:          39
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::destroy_event(extra->events[id][is]))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          350200
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          350900
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          353118
+    Length:          33
+    ReplacementText: 'DPCT_CHECK_ERROR(data = (char *)sycl::malloc_device(g_scratch_size, dpct::get_in_order_queue()))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          353512
+    Length:          23
+    ReplacementText: 'DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(size, dpct::get_in_order_queue()))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          353557
+    Length:          10
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memset'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          353582
+    Length:          0
+    ReplacementText: '.wait())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          353755
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          353838
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          353996
+    Length:          45
+    ReplacementText: 'DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(g_scratch_size, dpct::get_in_order_queue()))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          354826
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          354887
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          355134
+    Length:          10
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          355213
+    Length:          24
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          355238
+    Length:          0
+    ReplacementText: '.wait())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          355242
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          355840
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          356198
+    Length:          14
+    ReplacementText: 'dpct::device_info'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          356238
+    Length:          23
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_device_info'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          356262
+    Length:          5
+    ReplacementText: prop
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          356269
+    Length:          13
+    ReplacementText: 'dpct::dev_mgr::instance().get_device(g_main_device)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          356283
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          356385
+    Length:          4
+    ReplacementText: 'get_name()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          356399
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          356823
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          356902
+    Length:          26
+    ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(g_scratch_buffer, dpct::get_in_order_queue())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          356928
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          356964
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362011
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362043
+    Length:          33
+    ReplacementText: 'DPCT_CHECK_ERROR(device_count = dpct::dev_mgr::instance().device_count())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362080
+    Length:          11
+    ReplacementText: '0'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362145
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362242
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362248
+    Length:          14
+    ReplacementText: 'dpct::device_info'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362284
+    Length:          23
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_device_info'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362308
+    Length:          5
+    ReplacementText: prop
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362315
+    Length:          6
+    ReplacementText: 'dpct::dev_mgr::instance().get_device(device)'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362322
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362380
+    Length:          4
+    ReplacementText: 'get_name()'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          362388
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          363487
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          363602
+    Length:          22
+    ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          363624
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          363644
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          363953
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          364993
+    Length:          15
+    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          365077
+    Length:          31
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          365109
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          365150
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          365299
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          365509
+    Length:          23
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          365551
+    Length:          10
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          365603
+    Length:          24
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          365628
+    Length:          0
+    ReplacementText: '.wait())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          365632
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          365781
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          365991
+    Length:          23
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          366033
+    Length:          10
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          366091
+    Length:          24
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          366116
+    Length:          0
+    ReplacementText: '.wait())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          366120
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          366210
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          366365
+    Length:          23
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          366407
+    Length:          10
+    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memset'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          366452
+    Length:          0
+    ReplacementText: '.wait())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          366456
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          367132
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          367331
+    Length:          26
+    ReplacementText: 'DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(size, dpct::get_in_order_queue()))'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          367548
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          371821
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372103
+    Length:          15
+    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372160
+    Length:          24
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372184
+    Length:          36
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372221
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372225
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372367
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372649
+    Length:          15
+    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372712
+    Length:          24
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372736
+    Length:          36
+    ReplacementText: ''
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372773
+    Length:          0
+    ReplacementText: ')'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372777
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372845
+    Length:          0
+    ReplacementText: ' try '
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          372953
+    Length:          57
+    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait())'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Offset:          373036
+    Length:          0
+    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+MainSourceFilesDigest:
+  - MainSourceFile:  '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
+    Digest:          fe16d2da27d2d01e9e6dcb75ef2d0692
+DpctVersion:     18.0.0
+MainHelperFileName: ''
+USMLevel:        ''
+FeatureMap:      {}
+CompileTargets:  {}
+OptionMap:
+  AnalysisScopePath:
+    Value:           '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub'
+    Specified:       false
+  AsyncHandler:
+    Value:           'false'
+    Specified:       false
+  CommentsEnabled:
+    Value:           'false'
+    Specified:       false
+  CompilationsDir:
+    Value:           ''
+    Specified:       false
+  CtadEnabled:
+    Value:           'false'
+    Specified:       false
+  EnablepProfiling:
+    Value:           'true'
+    Specified:       true
+  ExperimentalFlag:
+    Value:           '0'
+    Specified:       false
+  ExplicitClNamespace:
+    Value:           'false'
+    Specified:       false
+  ExplicitNamespace:
+    Value:           '20'
+    Specified:       false
+  ExtensionDDFlag:
+    Value:           '0'
+    Specified:       false
+  ExtensionDEFlag:
+    Value:           '4294967295'
+    Specified:       false
+  HelperFuncPreferenceFlag:
+    Value:           '0'
+    Specified:       false
+  NDRangeDim:
+    Value:           '3'
+    Specified:       false
+  NoDRYPattern:
+    Value:           'false'
+    Specified:       false
+  NoUseGenericSpace:
+    Value:           ''
+    Specified:       true
+  OptimizeMigration:
+    Value:           'false'
+    Specified:       false
+  ProcessAll:
+    Value:           'false'
+    Specified:       false
+  RuleFile:
+    Value:           ''
+    Specified:       false
+  SyclNamedLambda:
+    Value:           'false'
+    Specified:       false
+  UsmLevel:
+    Value:           '1'
+    Specified:       false
+...
diff --git a/dpcpp_out2/ggml-alloc.h b/dpcpp_out2/ggml-alloc.h
new file mode 100644
index 0000000000000..64a412468915b
--- /dev/null
+++ b/dpcpp_out2/ggml-alloc.h
@@ -0,0 +1,92 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+struct ggml_backend;
+struct ggml_backend_buffer;
+struct ggml_backend_buffer_type;
+
+//
+// Legacy API
+//
+
+typedef struct ggml_allocr * ggml_allocr_t;
+
+// initialize allocator for use with CPU backend only
+GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
+GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
+
+// initialize allocator for use with ggml-backend
+GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
+
+GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
+
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph are optimized to execute out-of-order
+GGML_API void   ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
+
+GGML_API void   ggml_allocr_free       (ggml_allocr_t alloc);
+GGML_API bool   ggml_allocr_is_measure (ggml_allocr_t alloc);
+GGML_API void   ggml_allocr_reset      (ggml_allocr_t alloc);
+GGML_API void   ggml_allocr_alloc      (ggml_allocr_t alloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_allocr_max_size   (ggml_allocr_t alloc);
+
+GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
+
+//
+// ggml-backend v2 API
+//
+
+// Separate tensor and graph allocator objects
+// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
+// The original API is kept as a wrapper around the new API
+
+// Tensor allocator
+typedef struct ggml_tallocr * ggml_tallocr_t;
+
+GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
+GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
+GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
+
+GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
+
+GGML_API void   ggml_tallocr_free       (ggml_tallocr_t talloc);
+GGML_API bool   ggml_tallocr_is_measure (ggml_tallocr_t talloc);
+GGML_API void   ggml_tallocr_reset      (ggml_tallocr_t talloc);
+GGML_API void   ggml_tallocr_alloc      (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
+GGML_API size_t ggml_tallocr_max_size   (ggml_tallocr_t talloc);
+
+
+// Graph allocator
+typedef struct ggml_gallocr * ggml_gallocr_t;
+
+GGML_API ggml_gallocr_t ggml_gallocr_new(void);
+GGML_API void   ggml_gallocr_free(ggml_gallocr_t galloc);
+
+GGML_API void   ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
+GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
+
+// Allocate tensors from the allocators given by the hash table
+GGML_API void   ggml_gallocr_alloc_graph_n(
+                    ggml_gallocr_t galloc,
+                    struct ggml_cgraph * graph,
+                    struct ggml_hash_set hash_set,
+                    ggml_tallocr_t * hash_node_talloc);
+
+
+// Utils
+// Create a buffer and allocate all the tensors in a ggml_context
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/dpcpp_out2/ggml-backend-impl.h b/dpcpp_out2/ggml-backend-impl.h
new file mode 100644
index 0000000000000..05859935a3c2f
--- /dev/null
+++ b/dpcpp_out2/ggml-backend-impl.h
@@ -0,0 +1,116 @@
+#pragma once
+
+// ggml-backend internal header
+
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    //
+    // Backend buffer
+    //
+
+    // buffer type
+    typedef void * ggml_backend_buffer_type_context_t;
+
+    struct ggml_backend_buffer_type_i {
+        ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
+        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
+        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
+        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+        // check if tensor data is in host memory
+        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
+        bool                  (*is_host)         (ggml_backend_buffer_type_t buft);
+    };
+
+    struct ggml_backend_buffer_type {
+        struct ggml_backend_buffer_type_i  iface;
+        ggml_backend_buffer_type_context_t context;
+    };
+
+    // buffer
+    typedef void * ggml_backend_buffer_context_t;
+
+    struct ggml_backend_buffer_i {
+        void   (*free_buffer)    (ggml_backend_buffer_t buffer);
+        //void     (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
+        void * (*get_base)       (ggml_backend_buffer_t buffer);
+        void   (*init_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+        void   (*set_tensor)     (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void   (*get_tensor)     (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+        // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
+        void   (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void   (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void   (*clear)          (ggml_backend_buffer_t buffer, uint8_t value);
+    };
+
+    struct ggml_backend_buffer {
+        struct ggml_backend_buffer_i  iface;
+        ggml_backend_buffer_type_t    buft;
+        ggml_backend_buffer_context_t context;
+        size_t size;
+    };
+
+    ggml_backend_buffer_t ggml_backend_buffer_init(
+                   ggml_backend_buffer_type_t      buft,
+            struct ggml_backend_buffer_i           iface,
+                   ggml_backend_buffer_context_t   context,
+                   size_t                          size);
+
+
+    //
+    // Backend
+    //
+
+    typedef void * ggml_backend_context_t;
+
+    struct ggml_backend_i {
+        const char * (*get_name)(ggml_backend_t backend);
+
+        void (*free)(ggml_backend_t backend);
+
+        // buffer allocation
+        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
+
+        // (optional) asynchroneous tensor data access
+        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+        // (optional) asynchroneous tensor copy
+        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+        void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+        void (*synchronize)(ggml_backend_t backend);
+
+        // compute graph with a plan
+        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+        // compute graph without a plan
+        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+        // check if the backend supports an operation
+        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+    };
+
+    struct ggml_backend {
+        struct ggml_backend_i iface;
+
+        ggml_backend_context_t context;
+    };
+
+
+    //
+    // Backend registry
+    //
+
+    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
+
+    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/dpcpp_out2/ggml-backend.h b/dpcpp_out2/ggml-backend.h
new file mode 100644
index 0000000000000..a9d2fddd726a8
--- /dev/null
+++ b/dpcpp_out2/ggml-backend.h
@@ -0,0 +1,188 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-alloc.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+    typedef struct ggml_backend * ggml_backend_t;
+    typedef void * ggml_backend_graph_plan_t;
+
+    //
+    // Backend buffer
+    //
+
+    // buffer type
+    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
+    GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+    GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
+    GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
+    GGML_API bool ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
+
+    // buffer
+    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
+    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
+    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+    GGML_API void   ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
+    GGML_API bool   ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
+    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
+
+    //
+    // Backend
+    //
+
+
+    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+    GGML_API void         ggml_backend_free(ggml_backend_t backend);
+
+    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
+
+    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
+
+    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
+
+    // tensor copy between different backends
+    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
+
+    //
+    // CPU backend
+    //
+
+    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+
+    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
+    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+
+    // Create a backend buffer from an existing pointer
+    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
+
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
+
+#ifdef GGML_USE_CPU_HBM
+    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
+#endif
+
+    //
+    // Backend registry
+    //
+
+    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
+
+    GGML_API size_t                     ggml_backend_reg_get_count(void);
+    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
+    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
+    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
+    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);
+
+    //
+    // Backend scheduler
+    //
+
+    // The backend scheduler allows for multiple backends to be used together
+    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
+    // The backends are selected based on:
+    // - the backend that supports the operation
+    // - the location of the pre-allocated tensors (e.g. the weights)
+    /*
+      Example usage:
+
+        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
+        // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
+
+        // initialize buffers from a measure graph
+        measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
+
+        // in build_graph:
+        build_graph(...) {
+            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
+            alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
+            ggml_allocr_alloc(alloc_cpu, tensor);
+
+            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
+            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
+        }
+
+        // allocate backend buffers from measure graph
+        ggml_backend_sched_init_measure(sched, measure_graph);
+
+        // the scheduler is now ready to compute graphs
+
+        // compute
+        graph = build_graph(sched);
+        ggml_backend_sched_graph_compute(sched, graph);
+    */
+
+    struct ggml_backend_sched;
+    typedef struct ggml_backend_sched * ggml_backend_sched_t;
+
+    // Initialize a backend scheduler
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
+
+    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
+
+    // Initialize backend buffers from a measure graph
+    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+
+    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+
+    GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+
+    // Allocate a graph on the backend scheduler
+    GGML_API void ggml_backend_sched_graph_compute(
+            ggml_backend_sched_t sched,
+            struct ggml_cgraph * graph);
+
+
+    //
+    // Utils
+    //
+
+    struct ggml_backend_graph_copy {
+        ggml_backend_buffer_t buffer;
+        struct ggml_context * ctx_allocated;
+        struct ggml_context * ctx_unallocated;
+        struct ggml_cgraph * graph;
+    };
+
+    // Copy a graph to a different backend
+    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
+    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
+
+    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
+
+    // Compare the output of two backends
+    GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
+
+    // Tensor initialization
+    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
+    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/dpcpp_out2/ggml-cuda.dp.cpp b/dpcpp_out2/ggml-cuda.dp.cpp
new file mode 100644
index 0000000000000..fc6c68cdcef01
--- /dev/null
+++ b/dpcpp_out2/ggml-cuda.dp.cpp
@@ -0,0 +1,12724 @@
+#define DPCT_PROFILING_ENABLED
+#define DPCT_COMPAT_RT_VERSION 12010
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <algorithm>
+#include <assert.h>
+#include <atomic>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <float.h>
+#include <limits>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include <dpct/blas_utils.hpp>
+
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif // __HIP_PLATFORM_AMD__
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F  HIPBLAS_R_16F
+#define CUDA_R_32F  HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#ifdef GGML_HIP_UMA
+#define cudaMalloc hipMallocManaged
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
+#else
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#endif
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamFireAndForget hipStreamFireAndForget
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#define __trap abort
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
+#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
+#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
+#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
+#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
+#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
+#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
+#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
+#else
+
+#if DPCT_COMPAT_RT_VERSION < 11020
+#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
+#define CUBLAS_COMPUTE_16F CUDA_R_16F
+#define CUBLAS_COMPUTE_32F CUDA_R_32F
+#define cublasComputeType_t cudaDataType_t
+#endif // CUDART_VERSION < 11020
+
+#endif // defined(GGML_USE_HIPBLAS)
+
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+#include <cmath>
+
+#include <dpct/lib_common_utils.hpp>
+
+#define MIN_CC_DP4A   510 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_VOLTA      700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
+
+#define GGML_CUDA_MAX_NODES 8192
+
+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// -  7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+//       probably other such cases, and not sure what happens on AMD hardware
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(__gfx1100__)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif // defined(GGML_USE_HIPBLAS)
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+#if DPCT_COMPAT_RT_VERSION >= 12000
+    static const char *cublas_get_error_str(const int err) {
+        /*
+        DPCT1009:48: SYCL uses exceptions to report errors and does not use the
+        error codes. The original code was commented out and a warning string
+        was inserted. You need to rewrite this code.
+        */
+        return "cublasGetStatusString is not supported" /*cublasGetStatusString(err)*/
+            ;
+    }
+#else
+    static const char * cublas_get_error_str(const cublasStatus_t err) {
+        switch (err) {
+            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
+            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
+            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
+            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
+            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
+            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
+            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
+            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
+            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
+            default: return "unknown error";
+        }
+    }
+#endif // CUDART_VERSION >= 12000
+
+[[noreturn]]
+static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
+    fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg);
+    fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
+    GGML_ASSERT(!"CUDA error");
+}
+
+/*
+DPCT1001:50: The statement could not be removed.
+*/
+/*
+DPCT1000:51: Error handling if-stmt was detected but could not be rewritten.
+*/
+/*
+DPCT1009:52: SYCL uses exceptions to report errors and does not use the error
+codes. The original code was commented out and a warning string was inserted.
+You need to rewrite this code.
+*/
+#define CUDA_CHECK(err) do {                                                   \
+    auto err_ = (err); if (err_ != 0) ggml_cuda_error(                         \
+        #err, __func__, __FILE__, __LINE__,                                    \
+        "cudaGetErrorString is not supported" /*cudaGetErrorString(err_)*/);   \
+} while (0)
+#define CUBLAS_CHECK(err)                                                      \
+    do { auto err_ = (err); if (err_ != 0)                                     \
+             ggml_cuda_error(#err, __func__, __FILE__, __LINE__,               \
+                             cublas_get_error_str(err_)); } while (0)
+
+#if !defined(GGML_USE_HIPBLAS)
+static const char *cu_get_error_str(int err) {
+    const char * err_str;
+    /*
+    DPCT1007:49: Migration of cuGetErrorString is not supported.
+    */
+    cuGetErrorString(err, &err_str);
+    return err_str;
+}
+/*
+DPCT1001:67: The statement could not be removed.
+*/
+/*
+DPCT1000:68: Error handling if-stmt was detected but could not be rewritten.
+*/
+#define CU_CHECK(err)                                                          \
+    do { auto err_ = (err);                                                    \
+         if (err_ != 0) ggml_cuda_error(#err, __func__, __FILE__, __LINE__,    \
+                                        cu_get_error_str(err_)); } while (0)
+#endif
+
+#if DPCT_COMPAT_RT_VERSION >= 11100
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
+#ifdef GGML_CUDA_F16
+typedef half dfloat; // dequantize float
+typedef half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef sycl::float2 dfloat2;
+#endif //GGML_CUDA_F16
+
+static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) {
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] <<  0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __dpct_inline__ int get_int_from_uint8(const uint8_t *x8,
+                                              const int &i32) {
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] <<  0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __dpct_inline__ int get_int_from_int8_aligned(const int8_t *x8,
+                                                     const int &i32) {
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+static __dpct_inline__ int get_int_from_uint8_aligned(const uint8_t *x8,
+                                                      const int &i32) {
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+template <typename T>
+using to_t_cuda_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
+                             int k, dpct::queue_ptr stream);
+typedef to_t_cuda_t<float> to_fp32_cuda_t;
+typedef to_t_cuda_t<sycl::half> to_fp16_cuda_t;
+
+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
+typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
+typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
+typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+typedef void (*ggml_cuda_op_mul_mat_t)(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream);
+typedef void (*ggml_cuda_op_flatten_t)(const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst, const float *src0_dd,
+                                       const float *src1_dd, float *dst_dd,
+                                       const dpct::queue_ptr &main_stream);
+
+// QK = number of values after dequantization
+// QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
+
+#define QK4_0 32
+#define QR4_0 2
+#define QI4_0 (QK4_0 / (4 * QR4_0))
+typedef struct dpct_type_471834 {
+    sycl::half d;           // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define QK4_1 32
+#define QR4_1 2
+#define QI4_1 (QK4_1 / (4 * QR4_1))
+typedef struct dpct_type_143705 {
+    sycl::half2 dm;         // dm.x = delta, dm.y = min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+#define QK5_0 32
+#define QR5_0 2
+#define QI5_0 (QK5_0 / (4 * QR5_0))
+typedef struct dpct_type_673649 {
+    sycl::half d;           // delta
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_0 / 2];  // nibbles / quants
+} block_q5_0;
+static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+#define QK5_1 32
+#define QR5_1 2
+#define QI5_1 (QK5_1 / (4 * QR5_1))
+typedef struct dpct_type_135589 {
+    sycl::half2 dm;         // dm.x = delta, dm.y = min
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_1 / 2];  // nibbles / quants
+} block_q5_1;
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+#define QK8_0 32
+#define QR8_0 1
+#define QI8_0 (QK8_0 / (4 * QR8_0))
+typedef struct dpct_type_122878 {
+    sycl::half d;           // delta
+    int8_t  qs[QK8_0];      // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
+
+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 (QK8_1 / (4 * QR8_1))
+typedef struct dpct_type_143721 {
+    sycl::half2 ds;         // ds.x = delta, ds.y = sum
+    int8_t  qs[QK8_0];      // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+typedef void (*allocate_tiles_cuda_t)(int **x_ql, sycl::half2 **x_dm,
+                                      int **x_qh, int **x_sc);
+typedef void (*load_tiles_cuda_t)(const void *__restrict__ vx,
+                                  int *__restrict__ x_ql,
+                                  sycl::half2 *__restrict__ x_dm,
+                                  int *__restrict__ x_qh,
+                                  int *__restrict__ x_sc, const int &i_offset,
+                                  const int &i_max, const int &k,
+                                  const int &blocks_per_row);
+typedef float (*vec_dot_q_mul_mat_cuda_t)(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
+    const int &i, const int &j, const int &k);
+
+//================================= k-quants
+
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
+#define QK_K 256
+#define K_SCALE_SIZE 12
+#endif
+
+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
+typedef struct dpct_type_619598 {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    sycl::half2 dm;          // super-block scale for quantized scales/mins
+} block_q2_K;
+static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+
+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
+typedef struct dpct_type_138576 {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#ifdef GGML_QKK_64
+    uint8_t scales[2]; // scales, quantized with 8 bits
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    sycl::half d; // super-block scale
+} block_q3_K;
+//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
+
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
+#ifdef GGML_QKK_64
+typedef struct {
+    half    dm[2];             // super-block scales/mins
+    uint8_t scales[2];         // 4-bit block scales/mins
+    uint8_t qs[QK_K/2];        // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
+typedef struct dpct_type_154943 {
+    sycl::half2 dm;            // super-block scale for quantized scales/mins
+    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+#endif
+
+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
+#ifdef GGML_QKK_64
+typedef struct {
+    half d;                  // super-block scale
+    int8_t scales[QK_K/16];  // block scales
+    uint8_t qh[QK_K/8];      // quants, high bit
+    uint8_t qs[QK_K/2];      // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
+typedef struct dpct_type_866817 {
+    sycl::half2 dm;               // super-block scale for quantized scales/mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];           // quants, high bit
+    uint8_t qs[QK_K/2];           // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif
+
+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
+typedef struct dpct_type_107281 {
+    uint8_t ql[QK_K/2];   // quants, lower 4 bits
+    uint8_t qh[QK_K/4];   // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales
+    sycl::half d;            // delta
+} block_q6_K;
+static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
+
+#define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+
+#define CUDA_GELU_BLOCK_SIZE 256
+#define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_TANH_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
+#define CUDA_CPY_BLOCK_SIZE 32
+#define CUDA_SCALE_BLOCK_SIZE 256
+#define CUDA_CLAMP_BLOCK_SIZE 256
+#define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
+#define CUDA_ALIBI_BLOCK_SIZE 32
+#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
+#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
+#define CUDA_UPSCALE_BLOCK_SIZE 256
+#define CUDA_CONCAT_BLOCK_SIZE 256
+#define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ACC_BLOCK_SIZE 256
+#define CUDA_IM2COL_BLOCK_SIZE 256
+
+// dmmv = dequantize_mul_mat_vec
+#ifndef GGML_CUDA_DMMV_X
+#define GGML_CUDA_DMMV_X 32
+#endif
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
+#endif
+
+#ifndef K_QUANTS_PER_ITERATION
+#define K_QUANTS_PER_ITERATION 2
+#else
+static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+#endif
+
+#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#define MAX_STREAMS 8
+static dpct::queue_ptr g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = {
+    {&dpct::get_in_order_queue()}};
+
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    dpct::event_ptr
+        events[GGML_CUDA_MAX_DEVICES]
+              [MAX_STREAMS]; // events for synchronizing multiple GPUs
+};
+
+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+inline dpct::err0 ggml_cuda_set_device(const int device) try {
+    int current_device;
+    CUDA_CHECK(DPCT_CHECK_ERROR(
+        current_device = dpct::dev_mgr::instance().current_device_id()));
+
+    if (device == current_device) {
+        return 0;
+    }
+
+    /*
+    DPCT1093:53: The "device" device may be not the one intended for use. Adjust
+    the selected device if needed.
+    */
+    return DPCT_CHECK_ERROR(dpct::select_device(device));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static int g_device_count = -1;
+static int g_main_device = 0;
+static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+
+struct cuda_device_capabilities {
+    int     cc;                 // compute capability
+    bool    vmm;                // virtual memory support
+    size_t  vmm_granularity;    // granularity of virtual memory
+};
+
+static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
+
+
+static void * g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 0; // disabled by default
+static size_t g_scratch_offset = 0;
+
+static dpct::queue_ptr g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+[[noreturn]]
+static void bad_arch(const sycl::stream &stream_ct1) {
+    stream_ct1 << "ERROR: ggml-cuda was compiled without support for the "
+                  "current GPU architecture.\n";
+    __trap();
+
+    (void) bad_arch; // suppress unused function warning
+}
+
+static __dpct_inline__ float warp_reduce_sum(float x,
+                                             const sycl::nd_item<3> &item_ct1) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1096:98: The right-most dimension of the work-group used in the SYCL
+        kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
+    }
+    return x;
+}
+
+static __dpct_inline__ sycl::float2
+warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
+                                                mask);
+        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
+                                                mask);
+    }
+    return a;
+}
+
+static __dpct_inline__ float warp_reduce_max(float x,
+                                             const sycl::nd_item<3> &item_ct1) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1096:97: The right-most dimension of the work-group used in the SYCL
+        kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
+                              item_ct1.get_sub_group(), x, mask));
+    }
+    return x;
+}
+
+static __dpct_inline__ float op_repeat(const float a, const float b) {
+    return b;
+}
+
+static __dpct_inline__ float op_add(const float a, const float b) {
+    return a + b;
+}
+
+static __dpct_inline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
+
+static __dpct_inline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1));
+    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) /
+                   ne3;
+    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) %
+                   ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0;
+         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+}
+
+static void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
+static void gelu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f * xi *
+             (1.0f +
+              sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
+}
+
+static void silu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
+}
+
+static void gelu_quick_f32(const float *x, float *dst, int k,
+                           const sycl::nd_item<3> &item_ct1) {
+    const float GELU_QUICK_COEF = -1.702f;
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
+}
+
+static void tanh_f32(const float *x, float *dst, int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::tanh((float)(x[i]));
+}
+
+static void relu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmax((float)(x[i]), (float)0);
+}
+
+static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmax((float)(x[i]), (float)0) +
+             sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
+}
+
+static void sqr_f32(const float * x, float * dst, const int k,
+                    const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
+template <int block_size>
+static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
+                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+
+    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row*ncols + col];
+        mean_var.x() += xi;
+        mean_var.y() += xi * xi;
+    }
+
+    // sum up partial sums
+    mean_var = warp_reduce_sum(mean_var, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        /*
+        DPCT1118:0: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var, item_ct1);
+    }
+
+    const float mean = mean_var.x() / ncols;
+    const float var = mean_var.y() / ncols - mean * mean;
+    const float inv_std = sycl::rsqrt(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
+
+static void concat_f32(const float  *x,const float  *y, float *dst, const int ne0, const int ne02,
+                       const sycl::nd_item<3> &item_ct1) {
+    int nidx = item_ct1.get_local_id(2) +
+               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    if (item_ct1.get_group(0) < ne02) { // src0
+        int offset_src =
+            nidx + item_ct1.get_group(1) * ne0 +
+            item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+            dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            nidx + item_ct1.get_group(1) * ne0 +
+            (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
+            dst[offset_dst] = y[offset_src];
+    }
+}
+
+static void upscale_f32(const float  *x, float *dst, const int ne00, const int nb02, const int scale_factor,
+                        const sycl::nd_item<3> &item_ct1) {
+    int ne0 = ne00 * scale_factor;
+    int nidx = item_ct1.get_local_id(2) +
+               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int i00 = nidx / scale_factor;
+    int i01 = item_ct1.get_group(1) / scale_factor;
+    int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
+    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    dst[offset_dst] = x[offset_src];
+}
+
+static void pad_f32(const float  *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
+                    const sycl::nd_item<3> &item_ct1) {
+    int nidx = item_ct1.get_local_id(2) +
+               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
+        item_ct1.get_group(0) < ne02) {
+        int offset_src = nidx + item_ct1.get_group(1) * ne00 +
+                         item_ct1.get_group(0) * ne00 * ne01;
+            dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+template <int block_size>
+static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
+                           const sycl::nd_item<3> &item_ct1, float *s_sum) {
+    int start = item_ct1.get_group(2) * group_size;
+    int end = start + group_size;
+
+    start += item_ct1.get_local_id(2);
+
+    if (end >= ne_elements) {
+        end = ne_elements;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:1: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:2: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float variance = tmp / group_size;
+    float scale = sycl::rsqrt(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
+template <int block_size>
+static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
+                         const sycl::nd_item<3> &item_ct1, float *s_sum) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row*ncols + col];
+        tmp += xi * xi;
+    }
+
+    // sum up partial sums
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:3: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    const float mean = tmp / ncols;
+    const float scale = sycl::rsqrt(mean + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = scale * x[row*ncols + col];
+    }
+}
+
+static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x() = vui & 0xF;
+    v.y() = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {8.0f, 8.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x() = (v.x() - 8.0f) * d;
+    v.y() = (v.y() - 8.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const dfloat d = x[ib].dm[0];
+    const dfloat m = x[ib].dm[1];
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x() = vui & 0xF;
+    v.y() = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x() = (v.x() * d) + m;
+    v.y() = (v.y() * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {16.0f, 16.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x() = (v.x() - 16.0f) * d;
+    v.y() = (v.y() - 16.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const dfloat d = x[ib].dm[0];
+    const dfloat m = x[ib].dm[1];
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x() = (v.x() * d) + m;
+    v.y() = (v.y() * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    v.x() = x[ib].qs[iqs + 0];
+    v.y() = x[ib].qs[iqs + 1];
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+#else
+    v.x() *= d;
+    v.y() *= d;
+#endif // GGML_CUDA_F16
+}
+
+//================================== k-quants
+
+template<typename dst_t>
+static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+    const block_q2_K * x = (const block_q2_K *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int n   = tid/32;
+    const int l   = tid - 32*n;
+    const int is  = 8*n + l/16;
+
+    const uint8_t q = x[i].qs[32*n + l];
+    dst_t * y = yy + i*QK_K + 128*n;
+
+    float dall = x[i].dm[0];
+    float dmin = x[i].dm[1];
+    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
+    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
+    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
+#else
+    const int is = tid/16;  // 0 or 1
+    const int il = tid%16;  // 0...15
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    dst_t * y = yy + i*QK_K + 16*is + il;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
+    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
+#endif
+
+}
+
+template<typename dst_t>
+static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+    const block_q3_K * x = (const block_q3_K *) vx;
+
+#if QK_K == 256
+    const int r = item_ct1.get_local_id(2) / 4;
+    const int tid = r/2;
+    const int is0 = r%2;
+    const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
+    const int n = tid / 4;
+    const int j = tid - 4*n;
+
+    uint8_t m = 1 << (4*n + j);
+    int is = 8*n + 2*j + is0;
+    int shift = 2*j;
+
+    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
+                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
+                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
+                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
+    float d_all = x[i].d;
+    float dl = d_all * (us - 32);
+
+    dst_t * y = yy + i*QK_K + 128*n + 32*j;
+    const uint8_t * q = x[i].qs + 32*n;
+    const uint8_t * hm = x[i].hmask;
+
+    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
+#else
+    const int tid = threadIdx.x;
+    const int is  = tid/16;  // 0 or 1
+    const int il  = tid%16;  // 0...15
+    const int im  = il/8;    // 0...1
+    const int in  = il%8;    // 0...7
+
+    dst_t * y = yy + i*QK_K + 16*is + il;
+
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    const uint8_t h = x[i].hmask[in] >> (2*is + im);
+    const float   d = (float)x[i].d;
+
+    if (is == 0) {
+        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    } else {
+        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    }
+#endif
+
+}
+
+#if QK_K == 256
+static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
+    if (j < 4) {
+        d = q[j] & 63; m = q[j + 4] & 63;
+    } else {
+        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
+    }
+}
+#endif
+
+template<typename dst_t>
+static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q4_K * x = (const block_q4_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+
+#if QK_K == 256
+    // assume 32 threads
+    const int tid = item_ct1.get_local_id(2);
+    const int il  = tid/8;
+    const int ir  = tid%8;
+    const int is  = 2*il;
+    const int n   = 4;
+
+    dst_t * y = yy + i*QK_K + 64*il + n*ir;
+
+    const float dall = x[i].dm[0];
+    const float dmin = x[i].dm[1];
+
+    const uint8_t * q = x[i].qs + 32*il + n*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+    for (int l = 0; l < n; ++l) {
+        y[l + 0] = d1 * (q[l] & 0xF) - m1;
+        y[l +32] = d2 * (q[l] >>  4) - m2;
+    }
+#else
+    const int tid = threadIdx.x;
+    const uint8_t * q = x[i].qs;
+    dst_t * y = yy + i*QK_K;
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
+    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
+    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q5_K * x = (const block_q5_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+
+#if QK_K == 256
+    // assume 64 threads - this is very slightly better than the one below
+    const int tid = item_ct1.get_local_id(2);
+    const int il  = tid/16;   // il is in 0...3
+    const int ir  = tid%16;   // ir is in 0...15
+    const int is  = 2*il;     // is is in 0...6
+
+    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
+
+    const float dall = x[i].dm[0];
+    const float dmin = x[i].dm[1];
+
+    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
+    const uint8_t * qh = x[i].qh + 2*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+
+    uint8_t   hm  = 1 << (2*il);
+    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
+    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
+    hm <<= 1;
+    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
+    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
+#else
+    const int tid = threadIdx.x;
+    const uint8_t q = x[i].qs[tid];
+    const int im = tid/8;  // 0...3
+    const int in = tid%8;  // 0...7
+    const int is = tid/16; // 0 or 1
+    const uint8_t h = x[i].qh[in] >> im;
+    const float d = x[i].d;
+    dst_t * y = yy + i*QK_K + tid;
+    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
+    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q6_K * x = (const block_q6_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+#if QK_K == 256
+
+    // assume 64 threads - this is very slightly better than the one below
+    const int tid = item_ct1.get_local_id(2);
+    const int ip  = tid/32;   // ip is 0 or 1
+    const int il  = tid - 32*ip; // 0...32
+    const int is  = 8*ip + il/16;
+
+    dst_t * y = yy + i*QK_K + 128*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t * ql = x[i].ql + 64*ip + il;
+    const uint8_t   qh = x[i].qh[32*ip + il];
+    const int8_t  * sc = x[i].scales + is;
+
+    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
+    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
+#else
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int ip  = tid/16;         // 0 or 1
+    const int il  = tid - 16*ip;    // 0...15
+
+    dst_t * y = yy + i*QK_K + 16*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t   ql = x[i].ql[16*ip + il];
+    const uint8_t   qh = x[i].qh[il] >> (2*ip);
+    const int8_t  * sc = x[i].scales;
+
+    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+#endif
+}
+
+/*
+DPCT1110:4: The total declared local variable size in device function
+dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q2_K * x = (const block_q2_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
+    const int q_offset = 32*im + l0;
+    const int s_offset = 8*im;
+    const int y_offset = 128*im + l0;
+
+    uint32_t aux[4];
+    const uint8_t * d = (const uint8_t *)aux;
+    const uint8_t * m = (const uint8_t *)(aux + 2);
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+
+        const float dall = x[i].dm[0];
+        const float dmin = x[i].dm[1];
+
+        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
+        aux[0] = a[0] & 0x0f0f0f0f;
+        aux[1] = a[1] & 0x0f0f0f0f;
+        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
+        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
+            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
+                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
+
+        }
+        tmp += dall * sum1 - dmin * sum2;
+
+    }
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;
+
+    uint32_t uaux[2];
+    const uint8_t * d = (const uint8_t *)uaux;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint32_t * s = (const uint32_t *)x[i].scales;
+
+        uaux[0] = s[0] & 0x0f0f0f0f;
+        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
+
+        const float2 dall = __half22float2(x[i].dm);
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t ql = q[l];
+            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
+                  + y[l+16] * d[1] * ((ql >> 2) & 3)
+                  + y[l+32] * d[2] * ((ql >> 4) & 3)
+                  + y[l+48] * d[3] * ((ql >> 6) & 3);
+            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
+        }
+        tmp += dall.x * sum1 - dall.y * sum2;
+    }
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:5: The total declared local variable size in device function
+dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q3_K * x = (const block_q3_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+
+    const uint16_t kmask1 = 0x0303;
+    const uint16_t kmask2 = 0x0f0f;
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
+    const int step = 16/K_QUANTS_PER_ITERATION;
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0....15 or 0...7
+
+    const uint8_t m = 1 << (4*im);
+
+    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
+    const int q_offset =  32*im + l0;
+    const int y_offset = 128*im + l0;
+
+    uint16_t utmp[4];
+    const int8_t * s = (const int8_t *)utmp;
+
+    const uint16_t s_shift = 4*im;
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+        const uint8_t * h = x[i].hmask + l0;
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
+        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
+
+        const float d = x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < n; ++l) {
+            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
+            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
+        }
+        tmp += d * sum;
+
+    }
+#else
+
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
+    const int in = offset/8;                                 // 0 or 1
+    const int im = offset%8;                                 // 0...7
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint8_t * s = x[i].scales;
+
+        const float dall = (float)x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t hl = x[i].hmask[im+l] >> in;
+            const uint8_t ql = q[l];
+            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
+                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
+                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
+                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
+        }
+        tmp += sum;
+    }
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:6: The total declared local variable size in device function
+dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q4_K * x = (const block_q4_K *)vx + ib0;
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
+
+    const int il  = tid/step;                            // 0...3
+    const int ir  = tid - step*il;                       // 0...7 or 0...3
+    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y1 = yy + i*QK_K + y_offset;
+        const float   * y2 = y1 + 128;
+
+        const float dall = x[i].dm[0];
+        const float dmin = x[i].dm[1];
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
+        sycl::float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 4; ++l) {
+            s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4];
+            s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f +
+                       s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) -
+               dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
+
+    }
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    float tmp = 0;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t * q = x[i].qs + step;
+        const float   * y = yy + i*QK_K + step;
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux16[0] = a[0] & 0x0f0f;
+        aux16[1] = (a[0] >> 4) & 0x0f0f;
+        const float d = (float)x[i].dm[0];
+        const float m = (float)x[i].dm[1];
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
+                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
+                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
+                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
+        }
+        tmp += sum;
+    }
+
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:7: The total declared local variable size in device function
+dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2);
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q5_K * x = (const block_q5_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid = item_ct1.get_local_id(2) / 2; // 0...15
+    const int ix = item_ct1.get_local_id(2) % 2;
+
+    const int il  = tid/4;     // 0...3
+    const int ir  = tid - 4*il;// 0...3
+    const int n   = 2;
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    const uint8_t hm1  = 1 << (2*im);
+    const uint8_t hm2  = hm1 << 4;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2) {
+
+        const uint8_t * ql1 = x[i].qs + q_offset;
+        const uint8_t * qh  = x[i].qh + l0;
+        const float   * y1  = yy + i*QK_K + y_offset;
+        const float   * y2  = y1 + 128;
+
+        const float dall = x[i].dm[0];
+        const float dmin = x[i].dm[1];
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+        sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f;
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
+        for (int l = 0; l < n; ++l) {
+            sum.x() +=
+                y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
+                y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
+            sum.y() +=
+                y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
+                y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
+            sum.z() +=
+                y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
+                y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
+            sum.w() +=
+                y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
+                y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
+            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
+        }
+        tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
+                       sum.w() * sc[5]) -
+               dmin * smin;
+    }
+
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+    const int step = tid * K_QUANTS_PER_ITERATION;
+    const int im = step/8;
+    const int in = step%8;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t * q = x[i].qs + step;
+        const int8_t  * s = x[i].scales;
+        const float   * y = yy + i*QK_K + step;
+        const float     d = x[i].d;
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            const uint8_t h = x[i].qh[in+j] >> im;
+            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
+                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
+                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
+                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
+        }
+        tmp += sum;
+    }
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q6_K * x = (const block_q6_K *)vx + ib0;
+
+#if QK_K == 256
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+#if K_QUANTS_PER_ITERATION == 1
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
+    const int is = 0;
+#else
+    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
+    const int is = in / 4;
+#endif
+    const int ql_offset = 64*im + l0;
+    const int qh_offset = 32*im + l0;
+    const int s_offset  =  8*im + is;
+    const int y_offset = 128*im + l0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + y_offset;
+        const uint8_t * ql = x[i].ql + ql_offset;
+        const uint8_t * qh = x[i].qh + qh_offset;
+        const int8_t  * s  = x[i].scales + s_offset;
+
+        const float d = x[i].d;
+
+#if K_QUANTS_PER_ITERATION == 1
+        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
+        tmp += sum;
+#else
+        float sum = 0;
+        for (int l = 0; l < 4; ++l) {
+            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
+        }
+        tmp += sum;
+#endif
+
+    }
+
+#else
+
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + step;
+        const uint8_t * ql = x[i].ql + step;
+        const uint8_t * qh = x[i].qh + step;
+        const int8_t  * s  = x[i].scales;
+
+        const float d = x[i+0].d;
+
+        float sum = 0;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
+                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
+                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
+                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
+        }
+        tmp += sum;
+
+    }
+
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const sycl::half *x = (const sycl::half *)vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x() = x[ib + iqs + 0];
+    v.y() = x[ib + iqs + 1];
+}
+
+static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const float * x = (const float *) vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x() = x[ib + iqs + 0];
+    v.y() = x[ib + iqs + 1];
+}
+
+static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+    if (ix >= kx_padded) {
+        return;
+    }
+
+    const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                   item_ct1.get_local_id(1);
+
+    const int i_padded = iy*kx_padded + ix;
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i_padded / QK8_1; // block index
+    const int iqs = i_padded % QK8_1; // quant index
+
+    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
+    float amax = sycl::fabs((float)xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
+                                    item_ct1.get_sub_group(), amax, mask));
+        sum +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : sycl::round(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    reinterpret_cast<sycl::half &>(y[ib].ds.x()) = d;
+    reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
+}
+
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void k_get_rows(
+            const void * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12,
+            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
+
+    const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                     item_ct1.get_local_id(2)) *
+                    2;
+    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) /
+                    ne12;
+    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) %
+                    ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
+
+    const int ib = i00/qk; // block index
+    const int iqs = (i00%qk)/qr; // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(src0_row, ib, iqs, v);
+
+    dst_row[iybs + iqs + 0] = v.x();
+    dst_row[iybs + iqs + y_offset] = v.y();
+}
+
+template<typename src0_t, typename dst_t>
+static void k_get_rows_float(
+            const src0_t * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12,
+            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
+
+    const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                    item_ct1.get_local_id(2);
+    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) /
+                    ne12;
+    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) %
+                    ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
+                             const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  2 * item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    const int ib = i/qk; // block index
+    const int iqs = (i%qk)/qr; // quant index
+    const int iybs = i - i%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(vx, ib, iqs, v);
+
+    y[iybs + iqs + 0] = v.x();
+    y[iybs + iqs + y_offset] = v.y();
+}
+
+// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
+// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
+
+#define VDR_Q4_0_Q8_1_MMVQ 2
+#define VDR_Q4_0_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
+                                                    const float &d4,
+                                                    const sycl::half2 &ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
+        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
+    }
+
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+
+    // second part effectively subtracts 8 from each quant value
+    return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y());
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_1_Q8_1_MMVQ 2
+#define VDR_Q4_1_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
+                                                    const sycl::half2 &dm4,
+                                                    const sycl::half2 &ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
+        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
+    const float d4d8 = tmp.x;
+    const float m4s8 = tmp.y;
+#else
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const float d4d8 = dm4f.x() * ds8f.x();
+    const float m4s8 = dm4f.y() * ds8f.y();
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
+    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_0_Q8_1_MMVQ 2
+#define VDR_Q5_0_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
+                       const float &d5, const sycl::half2 &ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = dpct::dp4a(vi0, u[2 * i + 0],
+                          sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = dpct::dp4a(vi1, u[2 * i + 1],
+                          sumi); // SIMD dot product of quantized values
+    }
+
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+
+    // second part effectively subtracts 16 from each quant value
+    return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y());
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_1_Q8_1_MMVQ 2
+#define VDR_Q5_1_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
+                       const sycl::half2 &dm5, const sycl::half2 &ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = dpct::dp4a(vi0, u[2 * i + 0],
+                          sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = dpct::dp4a(vi1, u[2 * i + 1],
+                          sumi); // SIMD dot product of quantized values
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
+    const float d5d8 = tmp.x;
+    const float m5s8 = tmp.y;
+#else
+    const sycl::float2 dm5f =
+        dm5.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const float d5d8 = dm5f.x() * ds8f.x();
+    const float m5s8 = dm5f.y() * ds8f.y();
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
+    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
+
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q8_0_Q8_1_MMVQ 2
+#define VDR_Q8_0_Q8_1_MMQ 8
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
+                                                    const float &d8_0,
+                                                    const float &d8_1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = dpct::dp4a(v[i], u[i], sumi);
+    }
+
+    return d8_0*d8_1 * sumi;
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
+                                                    const sycl::half2 &dm8,
+                                                    const sycl::half2 &ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = dpct::dp4a(v[i], u[i], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
+    const float d8d8 = tmp.x;
+    const float m8s8 = tmp.y;
+#else
+    const sycl::float2 dm8f =
+        dm8.convert<float, sycl::rounding_mode::automatic>();
+    const sycl::float2 ds8f =
+        ds8.convert<float, sycl::rounding_mode::automatic>();
+    const float d8d8 = dm8f.x() * ds8f.x();
+    const float m8s8 = dm8f.y() * ds8f.y();
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
+    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q2_K_Q8_1_MMVQ 1
+#define VDR_Q2_K_Q8_1_MMQ  2
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
+    const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales,
+    const sycl::half2 &dm2, const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = scales[2*i];
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+
+        sumf_d +=
+            d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m <<  8;
+        m |= m << 16;
+        sumf_m += d8[i] *
+                  dpct::dp4a(
+                      m, u[i],
+                      0); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    const sycl::float2 dm2f =
+        dm2.convert<float, sycl::rounding_mode::automatic>();
+
+    return dm2f.x() * sumf_d - dm2f.y() * sumf_m;
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const uint8_t *__restrict__ scales,
+                           const sycl::half2 &dm2, const float &d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi_d = 0;
+    int sumi_m = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
+        int sumi_d_sc = 0;
+
+        const int sc = scales[i0 / (QI8_1/2)];
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m <<  8;
+        m |= m << 16;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
+            sumi_m = dpct::dp4a(m, u[i],
+                                sumi_m); // multiply sum of q8_1 values with m
+        }
+
+        sumi_d += sumi_d_sc * (sc & 0xF);
+    }
+
+    const sycl::float2 dm2f =
+        dm2.convert<float, sycl::rounding_mode::automatic>();
+
+    return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m);
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q3_K_Q8_1_MMVQ 1
+#define VDR_Q3_K_Q8_1_MMQ  2
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
+    const int &vl, const int &vh, const int *__restrict__ u,
+    const uint8_t *__restrict__ scales, const int &scale_offset,
+    const float &d3, const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi =
+            dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat());
+
+        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d3 * sumf;
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const int8_t *__restrict__ scales, const float &d3,
+                           const float &d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
+        int sumi_sc = 0;
+
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+        }
+
+        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
+    }
+
+    return d3*d8 * sumi;
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_K_Q8_1_MMVQ 2
+#define VDR_Q4_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K; ++i) {
+        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
+        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int dot1 =
+            dpct::dp4a(v1i, u[2 * i + 1],
+                       dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product
+        const int dot2 =
+            dpct::dp4a(0x01010101, u[2 * i + 1],
+                       dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
+
+    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
+
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F,
+                                u[i * QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const sycl::float2 ds8f =
+            ds8[i].convert<float, sycl::rounding_mode::automatic>();
+
+        sumf_d += ds8f.x() * (sc[i] * sumi_d);
+        sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
+
+    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
+
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_K_Q8_1_MMVQ 2
+#define VDR_Q5_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
+    const int *__restrict__ vl, const int *__restrict__ vh,
+    const int *__restrict__ u, const uint8_t *__restrict__ sc,
+    const uint8_t *__restrict__ m, const sycl::half2 &dm5,
+    const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
+        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
+        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
+
+        const int v0i = vl0i | vh0i;
+        const int v1i = vl1i | vh1i;
+
+        const int dot1 =
+            dpct::dp4a(v0i, u[2 * i + 0],
+                       dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product
+        const int dot2 =
+            dpct::dp4a(0x01010101, u[2 * i + 0],
+                       dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);
+
+    }
+
+    const sycl::float2 dm5f =
+        dm5.convert<float, sycl::rounding_mode::automatic>();
+
+    return dm5f.x() * sumf_d - dm5f.y() * sumf_m;
+
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j],
+                                sumi_d); // SIMD dot product
+        }
+
+        const sycl::float2 ds8f =
+            ds8[i].convert<float, sycl::rounding_mode::automatic>();
+
+        sumf_d += ds8f.x() * (sc[i] * sumi_d);
+        sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const sycl::float2 dm4f =
+        dm4.convert<float, sycl::rounding_mode::automatic>();
+
+    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
+
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q6_K_Q8_1_MMVQ 1
+#define VDR_Q6_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
+                            const int *__restrict__ u,
+                            const int8_t *__restrict__ scales, const float &d,
+                            const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = scales[4*i];
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
+
+        const int vi = dpct::vectorized_binary<sycl::char4>(
+            (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32
+
+        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const int8_t *__restrict__ sc, const float &d6,
+                           const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
+        sycl::int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
+
+#pragma unroll
+        for (int i = i0; i < i0 + 2; ++i) {
+            sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0],
+                                    sumi_d.x()); // SIMD dot product
+            sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1],
+                                    sumi_d.x()); // SIMD dot product
+
+            sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4],
+                                    sumi_d.y()); // SIMD dot product
+            sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5],
+                                    sumi_d.y()); // SIMD dot product
+        }
+
+        sumf_d += d8[i0 / 4] *
+                  (sc[i0 / 2 + 0] * sumi_d.x() + sc[i0 / 2 + 1] * sumi_d.y());
+    }
+
+    return d6 * sumf_d;
+
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __dpct_inline__ float
+vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int v[VDR_Q4_0_Q8_1_MMVQ];
+    int u[2*VDR_Q4_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs_q4_0, float *tile_x_d_q4_0) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs_q4_0;
+    *x_dm = (sycl::half2 *)tile_x_d_q4_0;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_0;
+    const int kqsx = k % QI4_0;
+
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
+
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const float * x_dmf = (const float *) x_dm;
+
+    int u[2*VDR_Q4_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
+         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+static __dpct_inline__ float
+vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    int v[VDR_Q4_1_Q8_1_MMVQ];
+    int u[2*VDR_Q4_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
+        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs_q4_1, sycl::half2 *tile_x_dm_q4_1) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs_q4_1;
+    *x_dm = tile_x_dm_q4_1;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_1;
+    const int kqsx = k % QI4_1;
+
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
+        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+
+    int u[2*VDR_Q4_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
+         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+static __dpct_inline__ float
+vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int vl[VDR_Q5_0_Q8_1_MMVQ];
+    int vh[VDR_Q5_0_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
+        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
+    }
+
+    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q5_0, float *tile_x_d_q5_0) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_ql_q5_0;
+    *x_dm = (sycl::half2 *)tile_x_d_q5_0;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_0;
+    const int kqsx = k % QI5_0;
+
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        const int ql = get_int_from_uint8(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
+
+        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
+        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
+        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
+        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
+        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
+        qs0 = dpct::vectorized_binary<sycl::char4>(
+            qs0, 0x10101010, dpct::sub_sat()); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
+        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
+        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
+        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
+        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
+        qs1 = dpct::vectorized_binary<sycl::char4>(
+            qs1, 0x10101010, dpct::sub_sat()); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
+        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    int u[2*VDR_Q5_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
+    }
+
+    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+static __dpct_inline__ float
+vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    int vl[VDR_Q5_1_Q8_1_MMVQ];
+    int vh[VDR_Q5_1_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
+        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
+        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
+    }
+
+    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q5_1, sycl::half2 *tile_x_dm_q5_1) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_ql_q5_1;
+    *x_dm = tile_x_dm_q5_1;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_1;
+    const int kqsx = k % QI5_1;
+
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
+
+        int qs0 = (ql >>  0) & 0x0F0F0F0F;
+        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
+        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
+        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
+        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4) & 0x0F0F0F0F;
+        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
+        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
+        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
+        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
+        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
+
+    int u[2*VDR_Q5_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
+    }
+
+    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+static __dpct_inline__ float
+vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int v[VDR_Q8_0_Q8_1_MMVQ];
+    int u[VDR_Q8_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
+        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+    }
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d,
+                                                      bq8_1->ds[0]);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs_q8_0, float *tile_x_d_q8_0) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs_q8_0;
+    *x_dm = (sycl::half2 *)tile_x_d_q8_0;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI8_0;
+    const int kqsx = k % QI8_0;
+    float * x_dmf = (float *) x_dm;
+
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
+        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
+         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
+}
+
+static __dpct_inline__ float
+vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    const uint8_t * scales = bq2_K->scales + scale_offset;
+
+    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
+    int    u[QR2_K];
+    float d8[QR2_K];
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++ i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + i].ds[0];
+    }
+
+    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q2_K, sycl::half2 *tile_x_dm_q2_K,
+                    int *tile_x_sc_q2_K) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql_q2_K;
+    *x_dm = tile_x_dm_q2_K;
+    *x_sc = tile_x_sc_q2_K;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI2_K;
+    const int kqsx = k % QI2_K;
+
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
+        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
+
+        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
+    }
+}
+
+static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const int kbx = k / QI2_K;
+    const int ky  = (k % QI2_K) * QR2_K;
+    const float * y_df = (const float *) y_ds;
+
+    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
+
+    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
+    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
+
+#pragma unroll
+    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
+        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
+    }
+
+    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
+
+    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
+    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
+}
+
+static __dpct_inline__ float
+vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    const float d = bq3_K->d;
+
+    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
+
+    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
+
+    int    u[QR3_K];
+    float d8[QR3_K];
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + i].ds[0];
+    }
+
+    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q3_K, sycl::half2 *tile_x_dm_q3_K,
+                    int *tile_x_qh_q3_K, int *tile_x_sc_q3_K) {
+
+    *x_ql = tile_x_ql_q3_K;
+    *x_dm = tile_x_dm_q3_K;
+    *x_qh = tile_x_qh_q3_K;
+    *x_sc = tile_x_sc_q3_K;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI3_K;
+    const int kqsx = k % QI3_K;
+
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
+    const int kbxd = k % blocks_per_tile_x_row;
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
+        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
+        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
+
+        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
+
+        const int ksc = k % (QI3_K/4);
+
+        const int ksc_low = ksc % (QI3_K/8);
+        const int shift_low = 4 * (ksc / (QI3_K/8));
+        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
+
+        const int ksc_high = QI3_K/8;
+        const int shift_high = 2 * ksc;
+        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
+
+        const int sc = dpct::vectorized_binary<sycl::char4>(
+            sc_low | sc_high, 0x20202020, dpct::sub_sat());
+
+        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+
+    const int kbx  = k / QI3_K;
+    const int ky  = (k % QI3_K) * QR3_K;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+
+    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
+        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
+        const int shift = 2 * ((ky % 32) / 8);
+        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
+
+        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
+        const int vlh = (vh << 2) & 0x04040404;
+
+        v[l] = dpct::vectorized_binary<sycl::char4>(vll, vlh, dpct::sub_sat());
+    }
+
+    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
+    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
+}
+
+static __dpct_inline__ float
+vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+#ifndef GGML_QKK_64
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    int    v[2];
+    int    u[2*QR4_K];
+    float d8[QR4_K];
+
+    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
+
+    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    v[0] = q4[0];
+    v[1] = q4[4];
+
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = bq8i->ds[0];
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
+
+#else
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    const uint16_t * a = (const uint16_t *)bq4_K->scales;
+    aux16[0] = a[0] & 0x0f0f;
+    aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
+
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);
+
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
+
+    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+    return dall * sumf_d - dmin * sumf_m;
+
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q4_K, sycl::half2 *tile_x_dm_q4_K,
+                    int *tile_x_sc_q4_K) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql_q4_K;
+    *x_dm = tile_x_dm_q4_K;
+    *x_sc = tile_x_sc_q4_K;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI4_K; // == k if QK_K == 256
+
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
+        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+#if QK_K == 256
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
+
+        const int * scales = (const int *) bxi->scales;
+
+        const int ksc = k % (WARP_SIZE/8);
+
+        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
+
+    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+}
+
+static __dpct_inline__ float
+vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+#ifndef GGML_QKK_64
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    int   vl[2];
+    int   vh[2];
+    int    u[2*QR5_K];
+    float d8[QR5_K];
+
+    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
+
+    vl[0] = ql[0];
+    vl[1] = ql[4];
+
+    vh[0] = qh[0] >> bq8_offset;
+    vh[1] = qh[4] >> bq8_offset;
+
+    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = bq8i->ds[0];
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
+
+#else
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int8_t * s = bq5_K->scales;
+
+    const float d = bq5_K->d;
+
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);
+
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
+
+    const int * ql = (const int *)bq5_K->qs + (iqs/2);
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
+
+    const int step = 4 * (iqs/2); // 0, 4, 8, 12
+    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
+    const int in = step%8; // 0, 4, 0, 4
+    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+    return d * sumf_d;
+
+#else
+    bad_arch();
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql_q5_K, sycl::half2 *tile_x_dm_q5_K,
+                    int *tile_x_sc_q5_K) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql_q5_K;
+    *x_dm = tile_x_dm_q5_K;
+    *x_sc = tile_x_sc_q5_K;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI5_K; // == k if QK_K == 256
+
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
+        const int ky = QR5_K*kqsx;
+
+        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
+        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
+        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
+
+        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
+        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
+
+        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
+        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
+        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+#if QK_K == 256
+        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
+
+        const int * scales = (const int *) bxi->scales;
+
+        const int ksc = k % (WARP_SIZE/8);
+
+        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
+
+    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
+    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+}
+
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
+    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
+
+    const int8_t * scales = bq6_K->scales + scale_offset;
+
+    int    u[QR6_K];
+    float d8[QR6_K];
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + 2 * i].ds[0];
+    }
+
+    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI6_K; // == k if QK_K == 256
+
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
+        const int ky = QR6_K*kqsx;
+
+        const int ql = get_int_from_uint8(bxi->ql, kqsx);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
+        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
+        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
+
+        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
+        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
+
+        x_ql[i * (2 * WARP_SIZE + 1) + kq0] =
+            dpct::vectorized_binary<sycl::char4>(ql0 | qh0, 0x20202020,
+                                                 dpct::sub_sat());
+        x_ql[i * (2 * WARP_SIZE + 1) + kq1] =
+            dpct::vectorized_binary<sycl::char4>(ql1 | qh1, 0x20202020,
+                                                 dpct::sub_sat());
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
+    }
+}
+
+static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
+
+    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
+    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
+    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
+}
+
+template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
+          int mmq_y, int nwarps, load_tiles_cuda_t load_tiles, int vdr,
+          vec_dot_q_mul_mat_cuda_t vec_dot>
+/*
+DPCT1110:8: The total declared local variable size in device function mul_mat_q
+exceeds 128 bytes and may cause high register pressure. Consult with your
+hardware vendor to find the total register size available and adjust the code,
+or use smaller sub-group size to avoid high register pressure.
+*/
+static __dpct_inline__ void
+mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
+          float *__restrict__ dst, const int ncols_x, const int nrows_x,
+          const int ncols_y, const int nrows_y, const int nrows_dst,
+          int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh,
+          int *tile_x_sc, const sycl::nd_item<3> &item_ct1, int *tile_y_qs,
+          sycl::half2 *tile_y_ds) {
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    const int blocks_per_row_x = ncols_x / qk;
+    const int blocks_per_col_y = nrows_y / QK8_1;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    const int & ncols_dst = ncols_y;
+
+    const int row_dst_0 = item_ct1.get_group(2) * mmq_y;
+    const int & row_x_0 = row_dst_0;
+
+    const int col_dst_0 = item_ct1.get_group(1) * mmq_x;
+    const int & col_y_0 = col_dst_0;
+
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
+
+    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
+
+        load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm,
+                   tile_x_qh, tile_x_sc, item_ct1.get_local_id(1),
+                   nrows_x - row_x_0 - 1, item_ct1.get_local_id(2),
+                   blocks_per_row_x);
+
+#pragma unroll
+        for (int ir = 0; ir < qr; ++ir) {
+            const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2);
+            const int kbxd = kqs / QI8_1;
+
+#pragma unroll
+            for (int i = 0; i < mmq_x; i += nwarps) {
+                const int col_y_eff = dpct::min(
+                    (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i),
+                    ncols_y - 1); // to prevent out-of-bounds memory accesses
+
+                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
+
+                const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE +
+                                    kqs % WARP_SIZE;
+                tile_y_qs[index_y] = get_int_from_int8_aligned(
+                    by0->qs, item_ct1.get_local_id(2) % QI8_1);
+            }
+
+#pragma unroll
+            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                const int ids =
+                    (ids0 + item_ct1.get_local_id(1) * QI8_1 +
+                     item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) %
+                    mmq_x;
+                const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1);
+                const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1);
+
+                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                const sycl::half2 *dsi_src =
+                    &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) +
+                       ir * (WARP_SIZE / QI8_1) + kby]
+                         .ds;
+                sycl::half2 *dsi_dst =
+                    &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby];
+                if (need_sum) {
+                    *dsi_dst = *dsi_src;
+                } else {
+                    float * dfi_dst = (float *) dsi_dst;
+                    *dfi_dst = (*dsi_src)[0];
+                }
+            }
+
+            /*
+            DPCT1118:9: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            /*
+            DPCT1065:56: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier();
+
+// #pragma unroll // unrolling this loop causes too much register pressure
+            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
+#pragma unroll
+                for (int j = 0; j < mmq_x; j += nwarps) {
+#pragma unroll
+                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+                        sum[i / WARP_SIZE][j / nwarps] += vec_dot(
+                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
+                            tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i,
+                            item_ct1.get_local_id(1) + j, k);
+                    }
+                }
+            }
+
+            /*
+            DPCT1118:10: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            /*
+            DPCT1065:57: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier();
+        }
+    }
+
+#pragma unroll
+    for (int j = 0; j < mmq_x; j += nwarps) {
+        const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1);
+
+        if (col_dst >= ncols_dst) {
+            return;
+        }
+
+#pragma unroll
+        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+            const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i;
+
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
+        }
+    }
+}
+
+#define  MMQ_X_Q4_0_RDNA2  64
+#define  MMQ_Y_Q4_0_RDNA2  128
+#define NWARPS_Q4_0_RDNA2  8
+#define  MMQ_X_Q4_0_RDNA1  64
+#define  MMQ_Y_Q4_0_RDNA1  64
+#define NWARPS_Q4_0_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q4_0_AMPERE 4
+#define  MMQ_Y_Q4_0_AMPERE 32
+#define NWARPS_Q4_0_AMPERE 4
+#else
+#define  MMQ_X_Q4_0_AMPERE 64
+#define  MMQ_Y_Q4_0_AMPERE 128
+#define NWARPS_Q4_0_AMPERE 4
+#endif
+#define  MMQ_X_Q4_0_PASCAL 64
+#define  MMQ_Y_Q4_0_PASCAL 64
+#define NWARPS_Q4_0_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
+    const int nwarps = NWARPS_Q4_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
+    const int nwarps = NWARPS_Q4_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+    const int nwarps = NWARPS_Q4_0_AMPERE;
+    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_qs_q4_0, tile_x_d_q4_0);
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
+              load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ,
+              vec_dot_q4_0_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+    const int nwarps = NWARPS_Q4_0_PASCAL;
+
+    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+#else
+    (void) vec_dot_q4_0_q8_1_mul_mat;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q4_1_RDNA2  64
+#define  MMQ_Y_Q4_1_RDNA2  128
+#define NWARPS_Q4_1_RDNA2  8
+#define  MMQ_X_Q4_1_RDNA1  64
+#define  MMQ_Y_Q4_1_RDNA1  64
+#define NWARPS_Q4_1_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q4_1_AMPERE 4
+#define  MMQ_Y_Q4_1_AMPERE 32
+#define NWARPS_Q4_1_AMPERE 4
+#else
+#define  MMQ_X_Q4_1_AMPERE 64
+#define  MMQ_Y_Q4_1_AMPERE 128
+#define NWARPS_Q4_1_AMPERE 4
+#endif
+#define  MMQ_X_Q4_1_PASCAL 64
+#define  MMQ_Y_Q4_1_PASCAL 64
+#define NWARPS_Q4_1_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1,
+    sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
+    const int nwarps = NWARPS_Q4_1_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
+    const int nwarps = NWARPS_Q4_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+    const int nwarps = NWARPS_Q4_1_AMPERE;
+    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_qs_q4_1, tile_x_dm_q4_1);
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
+              load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ,
+              vec_dot_q4_1_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+    const int nwarps = NWARPS_Q4_1_PASCAL;
+    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+#else
+    (void) vec_dot_q4_1_q8_1_mul_mat;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q5_0_RDNA2  64
+#define  MMQ_Y_Q5_0_RDNA2  128
+#define NWARPS_Q5_0_RDNA2  8
+#define  MMQ_X_Q5_0_RDNA1  64
+#define  MMQ_Y_Q5_0_RDNA1  64
+#define NWARPS_Q5_0_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q5_0_AMPERE 4
+#define  MMQ_Y_Q5_0_AMPERE 32
+#define NWARPS_Q5_0_AMPERE 4
+#else
+#define  MMQ_X_Q5_0_AMPERE 128
+#define  MMQ_Y_Q5_0_AMPERE 64
+#define NWARPS_Q5_0_AMPERE 4
+#endif
+#define  MMQ_X_Q5_0_PASCAL 64
+#define  MMQ_Y_Q5_0_PASCAL 64
+#define NWARPS_Q5_0_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
+    const int nwarps = NWARPS_Q5_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
+    const int nwarps = NWARPS_Q5_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+    const int nwarps = NWARPS_Q5_0_AMPERE;
+    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q5_0, tile_x_d_q5_0);
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
+              load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ,
+              vec_dot_q5_0_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+    const int nwarps = NWARPS_Q5_0_PASCAL;
+    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+#else
+    (void) vec_dot_q5_0_q8_1_mul_mat;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q5_1_RDNA2  64
+#define  MMQ_Y_Q5_1_RDNA2  128
+#define NWARPS_Q5_1_RDNA2  8
+#define  MMQ_X_Q5_1_RDNA1  64
+#define  MMQ_Y_Q5_1_RDNA1  64
+#define NWARPS_Q5_1_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q5_1_AMPERE 4
+#define  MMQ_Y_Q5_1_AMPERE 32
+#define NWARPS_Q5_1_AMPERE 4
+#else
+#define  MMQ_X_Q5_1_AMPERE 128
+#define  MMQ_Y_Q5_1_AMPERE 64
+#define NWARPS_Q5_1_AMPERE 4
+#endif
+#define  MMQ_X_Q5_1_PASCAL 64
+#define  MMQ_Y_Q5_1_PASCAL 64
+#define NWARPS_Q5_1_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1,
+    sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
+    const int nwarps = NWARPS_Q5_1_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
+    const int nwarps = NWARPS_Q5_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+    const int nwarps = NWARPS_Q5_1_AMPERE;
+    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q5_1, tile_x_dm_q5_1);
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
+              load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ,
+              vec_dot_q5_1_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+    const int nwarps = NWARPS_Q5_1_PASCAL;
+    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+#else
+    (void) vec_dot_q5_1_q8_1_mul_mat;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q8_0_RDNA2  64
+#define  MMQ_Y_Q8_0_RDNA2  128
+#define NWARPS_Q8_0_RDNA2  8
+#define  MMQ_X_Q8_0_RDNA1  64
+#define  MMQ_Y_Q8_0_RDNA1  64
+#define NWARPS_Q8_0_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q8_0_AMPERE 4
+#define  MMQ_Y_Q8_0_AMPERE 32
+#define NWARPS_Q8_0_AMPERE 4
+#else
+#define  MMQ_X_Q8_0_AMPERE 128
+#define  MMQ_Y_Q8_0_AMPERE 64
+#define NWARPS_Q8_0_AMPERE 4
+#endif
+#define  MMQ_X_Q8_0_PASCAL 64
+#define  MMQ_Y_Q8_0_PASCAL 64
+#define NWARPS_Q8_0_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
+    const int nwarps = NWARPS_Q8_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
+    const int nwarps = NWARPS_Q8_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+    const int nwarps = NWARPS_Q8_0_AMPERE;
+    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_qs_q8_0, tile_x_d_q8_0);
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
+              load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ,
+              vec_dot_q8_0_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+    const int nwarps = NWARPS_Q8_0_PASCAL;
+    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+#else
+    (void) vec_dot_q8_0_q8_1_mul_mat;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q2_K_RDNA2  64
+#define  MMQ_Y_Q2_K_RDNA2  128
+#define NWARPS_Q2_K_RDNA2  8
+#define  MMQ_X_Q2_K_RDNA1  128
+#define  MMQ_Y_Q2_K_RDNA1  32
+#define NWARPS_Q2_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q2_K_AMPERE 4
+#define  MMQ_Y_Q2_K_AMPERE 32
+#define NWARPS_Q2_K_AMPERE 4
+#else
+#define  MMQ_X_Q2_K_AMPERE 64
+#define  MMQ_Y_Q2_K_AMPERE 128
+#define NWARPS_Q2_K_AMPERE 4
+#endif
+#define  MMQ_X_Q2_K_PASCAL 64
+#define  MMQ_Y_Q2_K_PASCAL 64
+#define NWARPS_Q2_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K,
+    sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs,
+    sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
+    const int nwarps = NWARPS_Q2_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
+    const int nwarps = NWARPS_Q2_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+    const int nwarps = NWARPS_Q2_K_AMPERE;
+    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K);
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
+              load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ,
+              vec_dot_q2_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+    const int nwarps = NWARPS_Q2_K_PASCAL;
+    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+#else
+    (void) vec_dot_q2_K_q8_1_mul_mat;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q3_K_RDNA2  128
+#define  MMQ_Y_Q3_K_RDNA2  64
+#define NWARPS_Q3_K_RDNA2  8
+#define  MMQ_X_Q3_K_RDNA1  32
+#define  MMQ_Y_Q3_K_RDNA1  128
+#define NWARPS_Q3_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q3_K_AMPERE 4
+#define  MMQ_Y_Q3_K_AMPERE 32
+#define NWARPS_Q3_K_AMPERE 4
+#else
+#define  MMQ_X_Q3_K_AMPERE 128
+#define  MMQ_Y_Q3_K_AMPERE 128
+#define NWARPS_Q3_K_AMPERE 4
+#endif
+#define  MMQ_X_Q3_K_PASCAL 64
+#define  MMQ_Y_Q3_K_PASCAL 64
+#define NWARPS_Q3_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K,
+    sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, int *tile_x_sc_q3_K,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
+    const int nwarps = NWARPS_Q3_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
+    const int nwarps = NWARPS_Q3_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+    const int nwarps = NWARPS_Q3_K_AMPERE;
+    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K,
+                               tile_x_sc_q3_K);
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
+              load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ,
+              vec_dot_q3_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+    const int nwarps = NWARPS_Q3_K_PASCAL;
+    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+#else
+    (void) vec_dot_q3_K_q8_1_mul_mat;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q4_K_RDNA2  64
+#define  MMQ_Y_Q4_K_RDNA2  128
+#define NWARPS_Q4_K_RDNA2  8
+#define  MMQ_X_Q4_K_RDNA1  32
+#define  MMQ_Y_Q4_K_RDNA1  64
+#define NWARPS_Q4_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q4_K_AMPERE 4
+#define  MMQ_Y_Q4_K_AMPERE 32
+#define NWARPS_Q4_K_AMPERE 4
+#else
+#define  MMQ_X_Q4_K_AMPERE 64
+#define  MMQ_Y_Q4_K_AMPERE 128
+#define NWARPS_Q4_K_AMPERE 4
+#endif
+#define  MMQ_X_Q4_K_PASCAL 64
+#define  MMQ_Y_Q4_K_PASCAL 64
+#define NWARPS_Q4_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K,
+    sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs,
+    sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
+    const int nwarps = NWARPS_Q4_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
+    const int nwarps = NWARPS_Q4_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+    const int nwarps = NWARPS_Q4_K_AMPERE;
+    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K);
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
+              load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ,
+              vec_dot_q4_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+    const int nwarps = NWARPS_Q4_K_PASCAL;
+    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+#else
+    (void) vec_dot_q4_K_q8_1_mul_mat;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q5_K_RDNA2  64
+#define  MMQ_Y_Q5_K_RDNA2  128
+#define NWARPS_Q5_K_RDNA2  8
+#define  MMQ_X_Q5_K_RDNA1  32
+#define  MMQ_Y_Q5_K_RDNA1  64
+#define NWARPS_Q5_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q5_K_AMPERE 4
+#define  MMQ_Y_Q5_K_AMPERE 32
+#define NWARPS_Q5_K_AMPERE 4
+#else
+#define  MMQ_X_Q5_K_AMPERE 64
+#define  MMQ_Y_Q5_K_AMPERE 128
+#define NWARPS_Q5_K_AMPERE 4
+#endif
+#define  MMQ_X_Q5_K_PASCAL 64
+#define  MMQ_Y_Q5_K_PASCAL 64
+#define NWARPS_Q5_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K,
+    sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs,
+    sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
+    const int nwarps = NWARPS_Q5_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
+    const int nwarps = NWARPS_Q5_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+    const int nwarps = NWARPS_Q5_K_AMPERE;
+    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K);
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
+              load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ,
+              vec_dot_q5_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+    const int nwarps = NWARPS_Q5_K_PASCAL;
+    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+#else
+    (void) vec_dot_q5_K_q8_1_mul_mat;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q6_K_RDNA2  64
+#define  MMQ_Y_Q6_K_RDNA2  128
+#define NWARPS_Q6_K_RDNA2  8
+#define  MMQ_X_Q6_K_RDNA1  32
+#define  MMQ_Y_Q6_K_RDNA1  64
+#define NWARPS_Q6_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q6_K_AMPERE 4
+#define  MMQ_Y_Q6_K_AMPERE 32
+#define NWARPS_Q6_K_AMPERE 4
+#else
+#define  MMQ_X_Q6_K_AMPERE 64
+#define  MMQ_Y_Q6_K_AMPERE 64
+#define NWARPS_Q6_K_AMPERE 4
+#endif
+#define  MMQ_X_Q6_K_PASCAL 64
+#define  MMQ_Y_Q6_K_PASCAL 64
+#define NWARPS_Q6_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
+    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
+    const int nwarps = NWARPS_Q6_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
+    const int nwarps = NWARPS_Q6_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+    const int nwarps = NWARPS_Q6_K_AMPERE;
+    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql, tile_x_dm, tile_x_sc);
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
+              load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ,
+              vec_dot_q6_K_q8_1_mul_mat>(
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+    const int nwarps = NWARPS_Q6_K_PASCAL;
+    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
+#else
+    (void) vec_dot_q6_K_q8_1_mul_mat;
+    bad_arch();
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
+static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row * blocks_per_row + i +
+                        item_ct1.get_local_id(2) / (qi / vdr); // x block index
+
+        const int iby = (i + item_ct1.get_local_id(2) / (qi / vdr)) *
+                        (qk / QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
+                                   const sycl::nd_item<3> &item_ct1) {
+    // qk = quantized weights per x block
+    // qr = number of quantized weights per data value in x block
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int tid = item_ct1.get_local_id(2);
+
+    const int iter_stride = 2*GGML_CUDA_DMMV_X;
+    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+// partial sum for each thread
+#ifdef GGML_CUDA_F16
+    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#else
+    float tmp = 0.0f;
+#endif // GGML_CUDA_F16
+
+    for (int i = 0; i < ncols; i += iter_stride) {
+        const int col = i + vals_per_iter*tid;
+        const int ib = (row*ncols + col)/qk; // x block index
+        const int iqs = (col%qk)/qr; // x quant index
+        const int iybs = col - col%qk; // y block start index
+
+// processing >2 values per i iter is faster for fast GPUs
+#pragma unroll
+        for (int j = 0; j < vals_per_iter; j += 2) {
+            // process 2 vals per j iter
+
+            // dequantize
+            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel(vx, ib, iqs + j/qr, v);
+
+            // matrix multiplication
+            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_CUDA_F16
+            tmp += __hmul2(v, {
+                y[iybs + iqs + j/qr + 0],
+                y[iybs + iqs + j/qr + y_offset]
+            });
+#else
+            tmp += v.x() * y[iybs + iqs + j / qr + 0];
+            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
+#endif // GGML_CUDA_F16
+        }
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+#ifdef GGML_CUDA_F16
+        dst[row] = tmp.x + tmp.y;
+#else
+        dst[row] = tmp;
+#endif // GGML_CUDA_F16
+    }
+}
+
+static void mul_mat_p021_f16_f32(
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
+    const sycl::nd_item<3> &item_ct1) {
+
+    const sycl::half *x = (const sycl::half *)vx;
+
+    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                      item_ct1.get_local_id(1);
+    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                        item_ct1.get_local_id(0);
+    const int channel_x = channel / (nchannels_y / nchannels_x);
+
+    const int nrows_y = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst = row_x;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x;
+         col_x0 += item_ct1.get_local_range(2)) {
+        const int col_x = col_x0 + item_ct1.get_local_id(2);
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        // x is transposed and permuted
+        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
+        const float xi =
+            sycl::vec<sycl::half, 1>(x[ix])
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const int row_y = col_x;
+
+
+        // y is not transposed but permuted
+        const int iy = channel*nrows_y + row_y;
+
+        tmp += xi * y[iy];
+    }
+
+    // dst is not transposed and not permuted
+    const int idst = channel*nrows_dst + row_dst;
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
+    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
+    const sycl::nd_item<3> &item_ct1) {
+
+    const sycl::half *x = (const sycl::half *)vx;
+
+    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                      item_ct1.get_local_id(1);
+    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                        item_ct1.get_local_id(0);
+    const int channel_x = channel / channel_x_divisor;
+
+    const int nrows_y   = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst   = row_x;
+
+    const int idst = channel*nrows_dst + row_dst;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x;
+         col_x0 += item_ct1.get_local_range(2)) {
+        const int col_x = col_x0 + item_ct1.get_local_id(2);
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        const int row_y = col_x;
+
+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
+        const int iy = channel*nrows_y + row_y;
+
+        const float xi =
+            sycl::vec<sycl::half, 1>(x[ix])
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        tmp += xi * y[iy];
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    float * dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    sycl::half *dsti = (sycl::half *)cdsti;
+
+    *dsti = sycl::vec<float, 1>(*xi)
+                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+}
+
+static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const sycl::half *xi = (const sycl::half *)cxi;
+    sycl::half *dsti = (sycl::half *)cdsti;
+
+    *dsti = *xi;
+}
+
+template <cpy_kernel_t cpy_1>
+static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+                                   const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+                                   const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
+                                   const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= ne) {
+        return;
+    }
+
+    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+    // then combine those indices with the corresponding byte offsets to get the total offsets
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = i - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = i - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax = sycl::fmax(amax, sycl::fabs((float)v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j]*id;
+
+        dsti->qs[j] = sycl::round((float)x0);
+    }
+}
+
+static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < sycl::fabs((float)v)) {
+            amax = sycl::fabs((float)v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = xi[0       + j]*id;
+        const float x1 = xi[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x() = d;
+    dsti->dm.y() = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (xi[0       + j] - vmin)*id;
+        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static void cpy_f32_q(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+                                 const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
+                                 const sycl::nd_item<3> &item_ct1) {
+    const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2)) *
+                  qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
+    return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
+}
+
+struct rope_corr_dims {
+    float v[4];
+};
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale);
+    }
+    *cos_theta = sycl::cos(theta) * mscale;
+    *sin_theta = sycl::sin(theta) * mscale;
+}
+
+// rope == RoPE == rotary positional embedding
+template<typename T, bool has_pos>
+static void rope(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+,
+    const sycl::nd_item<3> &item_ct1) {
+    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                         item_ct1.get_local_id(1));
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;
+
+    const int p = has_pos ? pos[i2] : 0;
+    const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + 1];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + 1] = x0*sin_theta + x1*cos_theta;
+}
+
+template<typename T, bool has_pos>
+static void rope_neox(
+    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
+,
+    const sycl::nd_item<3> &item_ct1) {
+    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                         item_ct1.get_local_id(1));
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int ib = col / n_dims;
+    const int ic = col % n_dims;
+
+    if (ib > 0) {
+        const int i = row*ncols + ib*n_dims + ic;
+
+        dst[i + 0] = x[i + 0];
+        dst[i + 1] = x[i + 1];
+
+        return;
+    }
+
+    const int i  = row*ncols + ib*n_dims + ic/2;
+    const int i2 = row/p_delta_rows;
+
+    float cur_rot = inv_ndims * ic - ib;
+
+    const int p = has_pos ? pos[i2] : 0;
+    const float theta_base =
+        p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims/2];
+
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+static void rope_glm_f32(
+    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    int n_ctx
+, const sycl::nd_item<3> &item_ct1) {
+    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;
+
+    const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols);
+     // FIXME: this is likely wrong
+    const int p = pos != nullptr ? pos[i2] : 0;
+
+    const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale;
+    const float sin_theta = sycl::sin((float)theta);
+    const float cos_theta = sycl::cos((float)theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta =
+        ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale;
+    const float sin_block_theta = sycl::sin((float)block_theta);
+    const float cos_block_theta = sycl::cos((float)block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
+static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+                                 const int n_heads_log2_floor, const float m0, const float m1,
+                                 const sycl::nd_item<3> &item_ct1) {
+    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i = row*ncols + col;
+
+    const int k = row/k_rows;
+
+    float m_k;
+    if (k < n_heads_log2_floor) {
+        m_k = dpct::pow(m0, k + 1);
+    } else {
+        m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
+    }
+
+    dst[i] = col * m_k + x[i];
+}
+
+static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(1);
+    const int col = item_ct1.get_local_id(2);
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum, item_ct1);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+template<typename T>
+static inline void swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
+                              const sycl::nd_item<3> &item_ct1) {
+    // bitonic sort
+    int col = item_ct1.get_local_id(2);
+    int row = item_ct1.get_group(1);
+
+    if (col >= ncols) return;
+
+    const float * x_row = x + row * ncols;
+    int * dst_row = dst + row * ncols;
+
+    // initialize indices
+    if (col < ncols) {
+        dst_row[col] = col;
+    }
+    /*
+    DPCT1065:58: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+
+    for (int k = 2; k <= ncols; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            /*
+            DPCT1118:11: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            /*
+            DPCT1065:59: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier();
+        }
+    }
+}
+
+static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
+                              const sycl::nd_item<3> &item_ct1) {
+    const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int i = row*ncols + col;
+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
+}
+
+static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
+                         const sycl::nd_item<3> &item_ct1, float *buf) {
+    const int tid = item_ct1.get_local_id(2);
+    const int rowx = item_ct1.get_group(2);
+    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+    const int block_size = item_ct1.get_local_range(2);
+
+    const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+    const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+
+    float max_val = -INFINITY;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
+    }
+
+    // find the max value in the block
+    max_val = warp_reduce_max(max_val, item_ct1);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = -INFINITY;
+        }
+        /*
+        DPCT1118:12: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        if (lane_id == 0) {
+            buf[warp_id] = max_val;
+        }
+        /*
+        DPCT1118:13: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        max_val = buf[lane_id];
+        max_val = warp_reduce_max(max_val, item_ct1);
+    }
+
+    float tmp = 0.f;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        const float val =
+            sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
+        tmp += val;
+        dst[ix] = val;
+    }
+
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = 0.f;
+        }
+        /*
+        DPCT1118:14: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        if (lane_id == 0) {
+            buf[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:15: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        tmp = buf[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    const float inv_tmp = 1.f / tmp;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = rowx*ncols + col;
+        dst[i] *= inv_tmp;
+    }
+}
+
+static void scale_f32(const float * x, float * dst, const float scale, const int k,
+                      const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = scale * x[i];
+}
+
+static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k,
+                      const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
+
+static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta,
+                           int IW, int IH, int OW, int KW, int KH,
+                           int pelements, int CHW, int s0, int s1, int p0,
+                           int p1, int d0, int d1,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_id(2) +
+                  item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (i >= pelements) {
+        return;
+    }
+
+    const int ksize = OW * (KH > 1 ? KW : 1);
+    const int kx = i / ksize;
+    const int kd = kx * ksize;
+    const int ky = (i - kd) / OW;
+    const int ix = i % OW;
+
+    const int64_t iiw = ix * s0 + kx * d0 - p0;
+    const int64_t iih = item_ct1.get_group(1) * s1 + ky * d1 - p1;
+
+    const int64_t offset_dst =
+        (item_ct1.get_group(1) * OW + ix) * CHW +
+        (item_ct1.get_group(0) * (KW * KH) + ky * KW + kx);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] =
+            sycl::vec<float, 1>(0.0f)
+                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+    } else {
+        const int64_t offset_src = item_ct1.get_group(0) * offset_delta;
+        dst[offset_dst] =
+            sycl::vec<float, 1>(x[offset_src + iih * IW + iiw])
+                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+    }
+}
+
+template <int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const ggml_tensor *src0, const ggml_tensor *src1,
+                          ggml_tensor *dst, const void *src0_dd,
+                          const int32_t *src1_dd, float *dst_dd,
+                          dpct::queue_ptr stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             k_get_rows<qk, qr, dq>(
+                                 src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+                         });
+
+    (void) dst;
+}
+
+template <typename src0_t>
+static void get_rows_cuda_float(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const src0_t *src0_dd, const int32_t *src1_dd,
+                                float *dst_dd, dpct::queue_ptr stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+            });
+    }
+
+    (void) dst;
+}
+
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_cuda {
+    template <typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor *src0,
+                    const struct ggml_tensor *src1, struct ggml_tensor *dst,
+                    const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
+                    dpct::queue_ptr stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne0[] = {ne0, ne1, ne2, ne3};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+        size_t cnb0[] = {nb0, nb1, nb2, nb3};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        for (int i = 0; i < 4; i++) {
+            if (nr[i] != 1) {
+                break;
+            }
+            if (i > 0) {
+                collapse_nb(cnb0, cne0);
+                collapse_nb(cnb1, cne1);
+                collapse(cne0);
+                collapse(cne1);
+            }
+        }
+        {
+            int64_t ne0 = cne0[0];
+            int64_t ne1 = cne0[1];
+            int64_t ne2 = cne0[2];
+            int64_t ne3 = cne0[3];
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb0[0];
+            size_t nb1 = cnb0[1];
+            size_t nb2 = cnb0[2];
+            size_t nb3 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            sycl::range<3> block_dims(1, 1, 1);
+            block_dims[2] = std::min<unsigned int>(hne0, block_size);
+            block_dims[1] = std::min<unsigned int>(
+                ne1, block_size / (unsigned int)block_dims[2]);
+            block_dims[0] = std::min(
+                std::min<unsigned int>(
+                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
+                                   (unsigned int)block_dims[1]),
+                64U);
+
+            sycl::range<3> block_nums(
+                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
+                (ne1 + block_dims[1] - 1) / block_dims[1],
+                (hne0 + block_dims[2] - 1) / block_dims[2]);
+
+            if (block_nums[0] > 65535) {
+                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                {
+                    dpct::has_capability_or_fail(stream->get_device(),
+                                                 {sycl::aspect::fp16});
+
+                    stream->parallel_for(
+                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
+                                              sycl::range<3>(1, 1, block_size),
+                                          sycl::range<3>(1, 1, block_size)),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_bin_bcast_unravel<bin_op>(
+                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
+                                ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
+                                s13, item_ct1);
+                        });
+                }
+            } else {
+                /*
+                DPCT1049:16: The work-group size passed to the SYCL kernel may
+                exceed the limit. To get the device limit, query
+                info::device::max_work_group_size. Adjust the work-group size if
+                needed.
+                */
+                dpct::has_capability_or_fail(stream->get_device(),
+                                             {sycl::aspect::fp16});
+
+                stream->parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
+                                            ne2, ne3, ne10, ne11, ne12, ne13,
+                                            s1, s2, s3, s11, s12, s13,
+                                            item_ct1);
+                    });
+            }
+        }
+    }
+};
+
+static void acc_f32_cuda(const float *x, const float *y, float *dst,
+                         const int n_elements, const int ne10, const int ne11,
+                         const int ne12, const int nb1, const int nb2,
+                         const int offset, dpct::queue_ptr stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
+                    item_ct1);
+        });
+}
+
+static void gelu_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            gelu_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void silu_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            silu_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void gelu_quick_f32_cuda(const float *x, float *dst, const int k,
+                                dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            gelu_quick_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void tanh_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            tanh_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void relu_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            relu_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void leaky_relu_f32_cuda(const float *x, float *dst, const int k,
+                                const float negative_slope,
+                                dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
+        });
+}
+
+static void sqr_f32_cuda(const float *x, float *dst, const int k,
+                         dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            sqr_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void norm_f32_cuda(const float *x, float *dst, const int ncols,
+                          const int nrows, const float eps,
+                          dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                sycl::range<1>(32), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
+                                            s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    } else {
+        const sycl::range<3> block_dims(1, 1, 1024);
+        /*
+        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                sycl::range<1>(32), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        norm_f32<1024>(x, dst, ncols, eps, item_ct1,
+                                       s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    }
+}
+
+static void group_norm_f32_cuda(const float *x, float *dst,
+                                const int num_groups, const int group_size,
+                                const int ne_elements, dpct::queue_ptr stream) {
+    static const float eps = 1e-6f;
+    if (group_size < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            const float eps_ct4 = eps;
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        group_norm_f32<WARP_SIZE>(
+                            x, dst, group_size, ne_elements, eps_ct4, item_ct1,
+                            s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    } else {
+        const sycl::range<3> block_dims(1, 1, 1024);
+        /*
+        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            const float eps_ct4 = eps;
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        group_norm_f32<1024>(x, dst, group_size, ne_elements,
+                                             eps_ct4, item_ct1,
+                                             s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    }
+}
+
+static void concat_f32_cuda(const float *x, const float *y, float *dst,
+                            const int ne0, int ne1, int ne2, int ne02,
+                            dpct::queue_ptr stream) {
+    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    sycl::range<3> gridDim(ne2, ne1, num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            concat_f32(x, y, dst, ne0, ne02, item_ct1);
+        });
+}
+
+static void upscale_f32_cuda(const float *x, float *dst, const int ne00,
+                             const int ne01, const int ne02,
+                             const int scale_factor, dpct::queue_ptr stream) {
+    int ne0 = (ne00 * scale_factor);
+    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+    sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
+        });
+}
+
+static void pad_f32_cuda(const float *x, float *dst, const int ne00,
+                         const int ne01, const int ne02, const int ne0,
+                         const int ne1, const int ne2, dpct::queue_ptr stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    sycl::range<3> gridDim(ne2, ne1, num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
+        });
+}
+
+static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols,
+                              const int nrows, const float eps,
+                              dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        rms_norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
+                                                s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    } else {
+        const sycl::range<3> block_dims(1, 1, 1024);
+        /*
+        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        rms_norm_f32<1024>(x, dst, ncols, eps, item_ct1,
+                                           s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    }
+}
+
+static void quantize_row_q8_1_cuda(const float *x, void *vy, const int kx,
+                                   const int ky, const int kx_padded,
+                                   dpct::queue_ptr stream) {
+    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const sycl::range<3> num_blocks(1, ky, block_num_x);
+    const sycl::range<3> block_size(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(num_blocks * block_size, block_size),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                quantize_q8_1(x, vy, kx, kx_padded, item_ct1);
+            });
+    }
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cuda(const void *__restrict__ vx,
+                                  dst_t *__restrict__ y, const int k,
+                                  dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(
+                sycl::range<3>(1, 1, num_blocks) *
+                    sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE),
+                sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
+            });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q2_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q2_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q3_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q3_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q4_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_K(vx, y, item_ct1);
+                             });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q5_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q5_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q6_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q6_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F32:
+            return dequantize_block_cuda<1, 1, convert_f32>;
+        default:
+            return nullptr;
+    }
+}
+
+static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F16:
+            return dequantize_block_cuda<1, 1, convert_f16>;
+        default:
+            return nullptr;
+    }
+}
+
+static void dequantize_mul_mat_vec_q4_0_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q4_1_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q5_0_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q5_1_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q8_0_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q2_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q3_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q4_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q5_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const sycl::range<3> block_dims(1, 1, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q6_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void convert_mul_mat_vec_f16_cuda(const void *vx, const dfloat *y,
+                                         float *dst, const int ncols,
+                                         const int nrows,
+                                         dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
+                                                          nrows, item_ct1);
+            });
+    }
+}
+
+static void mul_mat_vec_q4_0_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
+                          vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_1 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
+                          vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK5_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
+                          vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK5_1 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
+                          vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK8_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
+                          vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
+                          vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
+                          vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
+                          vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
+                          vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
+                          vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q4_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
+        nwarps = NWARPS_Q4_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q4_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
+        nwarps = NWARPS_Q4_0_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q4_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:20: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_qs_q4_0_acc_ct1.get_pointer(),
+                            tile_x_d_q4_0_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:21: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_qs_q4_0_acc_ct1.get_pointer(),
+                            tile_x_d_q4_0_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q4_1_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
+        nwarps = NWARPS_Q4_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q4_1_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
+        nwarps = NWARPS_Q4_1_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q4_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+        nwarps = NWARPS_Q4_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+        nwarps = NWARPS_Q4_1_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:22: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_qs_q4_1_acc_ct1.get_pointer(),
+                            tile_x_dm_q4_1_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:23: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_qs_q4_1_acc_ct1.get_pointer(),
+                            tile_x_dm_q4_1_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q5_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
+        nwarps = NWARPS_Q5_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q5_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
+        nwarps = NWARPS_Q5_0_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q5_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+        nwarps = NWARPS_Q5_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+        nwarps = NWARPS_Q5_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:24: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q5_0_acc_ct1.get_pointer(),
+                            tile_x_d_q5_0_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:25: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q5_0_acc_ct1.get_pointer(),
+                            tile_x_d_q5_0_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q5_1_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
+        nwarps = NWARPS_Q5_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q5_1_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
+        nwarps = NWARPS_Q5_1_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q5_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+        nwarps = NWARPS_Q5_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+        nwarps = NWARPS_Q5_1_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:26: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q5_1_acc_ct1.get_pointer(),
+                            tile_x_dm_q5_1_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:27: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_1<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q5_1_acc_ct1.get_pointer(),
+                            tile_x_dm_q5_1_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q8_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
+        nwarps = NWARPS_Q8_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q8_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
+        nwarps = NWARPS_Q8_0_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q8_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+        nwarps = NWARPS_Q8_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q8_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+        nwarps = NWARPS_Q8_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:28: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q8_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_qs_q8_0_acc_ct1.get_pointer(),
+                            tile_x_d_q8_0_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:29: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q8_0<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_qs_q8_0_acc_ct1.get_pointer(),
+                            tile_x_d_q8_0_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q2_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
+        nwarps = NWARPS_Q2_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q2_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
+        nwarps = NWARPS_Q2_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q2_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+        nwarps = NWARPS_Q2_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q2_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+        nwarps = NWARPS_Q2_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:30: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q2_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q2_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q2_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q2_K_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:31: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q2_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q2_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q2_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q2_K_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+#if QK_K == 256
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q3_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
+        nwarps = NWARPS_Q3_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q3_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
+        nwarps = NWARPS_Q3_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q3_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+        nwarps = NWARPS_Q3_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q3_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+        nwarps = NWARPS_Q3_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q3_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q3_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q3_K_acc_ct1.get_pointer(),
+                            tile_x_qh_q3_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q3_K_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q3_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q3_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q3_K_acc_ct1.get_pointer(),
+                            tile_x_qh_q3_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q3_K_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    }
+#endif
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q4_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
+        nwarps = NWARPS_Q4_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q4_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
+        nwarps = NWARPS_Q4_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q4_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q4_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q4_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q4_K_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q4_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q4_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q4_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q4_K_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q5_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
+        nwarps = NWARPS_Q5_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q5_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
+        nwarps = NWARPS_Q5_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q5_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q5_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q5_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q5_K_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q5_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_q5_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q5_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q5_K_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    const int compute_capability = g_device_caps[id].cc;
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q6_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
+        nwarps = NWARPS_Q6_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q6_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
+        nwarps = NWARPS_Q6_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q6_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q6_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q6_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            stream->submit([&](sycl::handler &cgh) {
+                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K),
+                    cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
+                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
+                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
+
+                cgh.parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        mul_mat_q6_K<need_check>(
+                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
+                            nrows_dst, item_ct1,
+                            tile_x_ql_acc_ct1.get_pointer(),
+                            tile_x_dm_acc_ct1.get_pointer(),
+                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_y_qs_acc_ct1.get_pointer(),
+                            tile_y_ds_acc_ct1.get_pointer());
+                    });
+            });
+        }
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_p021_f16_f32_cuda(const void *vx, const float *y,
+                                           float *dst, const int ncols_x,
+                                           const int nrows_x,
+                                           const int nchannels_x,
+                                           const int nchannels_y,
+                                           dpct::queue_ptr stream) {
+
+    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
+                                     nchannels_y, item_ct1);
+            });
+    }
+}
+
+static void ggml_mul_mat_vec_nc_f16_f32_cuda(
+    const void *vx, const float *y, float *dst, const int ncols_x,
+    const int nrows_x, const int row_stride_x, const int nchannels_x,
+    const int nchannels_y, const int channel_stride_x, dpct::queue_ptr stream) {
+
+    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
+                                       row_stride_x, channel_stride_x,
+                                       nchannels_y / nchannels_x, item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_q8_0_cuda(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int nb00, const int nb01,
+                                   const int nb02, const int ne10,
+                                   const int ne11, const int nb10,
+                                   const int nb11, const int nb12,
+                                   dpct::queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
+                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
+                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
+                         });
+}
+
+static void ggml_cpy_f32_q4_0_cuda(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int nb00, const int nb01,
+                                   const int nb02, const int ne10,
+                                   const int ne11, const int nb10,
+                                   const int nb11, const int nb12,
+                                   dpct::queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK4_0 == 0);
+    const int num_blocks = ne / QK4_0;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
+                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
+                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
+                         });
+}
+
+static void ggml_cpy_f32_q4_1_cuda(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int nb00, const int nb01,
+                                   const int nb02, const int ne10,
+                                   const int ne11, const int nb10,
+                                   const int nb11, const int nb12,
+                                   dpct::queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK4_1 == 0);
+    const int num_blocks = ne / QK4_1;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
+                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
+                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
+                         });
+}
+
+static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void scale_f32_cuda(const float *x, float *dst, const float scale,
+                           const int k, dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            scale_f32(x, dst, scale, k, item_ct1);
+        });
+}
+
+static void clamp_f32_cuda(const float *x, float *dst, const float min,
+                           const float max, const int k,
+                           dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            clamp_f32(x, dst, min, max, k, item_ct1);
+        });
+}
+
+template <typename T>
+static void rope_cuda(const T *x, T *dst, int ncols, int nrows,
+                      const int32_t *pos, float freq_scale, int p_delta_rows,
+                      float freq_base, float ext_factor, float attn_factor,
+                      rope_corr_dims corr_dims, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % 2 == 0);
+    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
+    if (pos == nullptr) {
+        /*
+        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows,
+                               freq_base, ext_factor, attn_factor, corr_dims,
+                               item_ct1);
+            });
+    } else {
+        /*
+        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows,
+                              freq_base, ext_factor, attn_factor, corr_dims,
+                              item_ct1);
+            });
+    }
+}
+
+template <typename T>
+static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows,
+                           const int32_t *pos, float freq_scale,
+                           int p_delta_rows, float freq_base, float ext_factor,
+                           float attn_factor, rope_corr_dims corr_dims,
+                           dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % 2 == 0);
+    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.0f / n_dims;
+
+    if (pos == nullptr) {
+        /*
+        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_neox<T, false>(x, dst, ncols, n_dims, pos, freq_scale,
+                                    p_delta_rows, ext_factor, attn_factor,
+                                    corr_dims, theta_scale, inv_ndims,
+                                    item_ct1);
+            });
+    } else {
+        /*
+        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_neox<T, true>(x, dst, ncols, n_dims, pos, freq_scale,
+                                   p_delta_rows, ext_factor, attn_factor,
+                                   corr_dims, theta_scale, inv_ndims, item_ct1);
+            });
+    }
+}
+
+static void rope_glm_f32_cuda(const float *x, float *dst, int ncols, int nrows,
+                              const int32_t *pos, float freq_scale,
+                              int p_delta_rows, float freq_base, int n_ctx,
+                              dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const sycl::range<3> block_dims(1, 1, CUDA_ROPE_BLOCK_SIZE / 4);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
+    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             rope_glm_f32(x, dst, ncols, pos, freq_scale,
+                                          p_delta_rows, freq_base, n_ctx,
+                                          item_ct1);
+                         });
+}
+
+static void alibi_f32_cuda(const float *x, float *dst, const int ncols,
+                           const int nrows, const int k_rows,
+                           const int n_heads_log2_floor, const float m0,
+                           const float m1, dpct::queue_ptr stream) {
+    const sycl::range<3> block_dims(1, 1, CUDA_ALIBI_BLOCK_SIZE);
+    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             alibi_f32(x, dst, ncols, k_rows,
+                                       n_heads_log2_floor, m0, m1, item_ct1);
+                         });
+}
+
+static void sum_rows_f32_cuda(const float *x, float *dst, const int ncols,
+                              const int nrows, dpct::queue_ptr stream) {
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1)
+                             [[intel::reqd_sub_group_size(32)]] {
+                                 k_sum_rows_f32(x, dst, ncols, item_ct1);
+                             });
+}
+
+static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols,
+                                 const int nrows, ggml_sort_order order,
+                                 dpct::queue_ptr stream) {
+    // bitonic sort requires ncols to be power of 2
+    GGML_ASSERT((ncols & (ncols - 1)) == 0);
+
+    const sycl::range<3> block_dims(1, 1, ncols);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    if (order == GGML_SORT_ASC) {
+        /*
+        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
+            });
+    } else if (order == GGML_SORT_DESC) {
+        /*
+        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
+            });
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
+static void diag_mask_inf_f32_cuda(const float *x, float *dst,
+                                   const int ncols_x, const int nrows_x,
+                                   const int rows_per_channel, const int n_past,
+                                   dpct::queue_ptr stream) {
+    const sycl::range<3> block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
+    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+    const sycl::range<3> block_nums(1, block_num_x, nrows_x);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             diag_mask_inf_f32(x, dst, ncols_x,
+                                               rows_per_channel, n_past,
+                                               item_ct1);
+                         });
+}
+
+static void soft_max_f32_cuda(const float *x, const float *y, float *dst,
+                              const int ncols_x, const int nrows_x,
+                              const int nrows_y, const float scale,
+                              dpct::queue_ptr stream) {
+    int nth = WARP_SIZE;
+    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    const sycl::range<3> block_dims(1, 1, nth);
+    const sycl::range<3> block_nums(1, 1, nrows_x);
+    /*
+    DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
+    */
+    stream->submit([&](sycl::handler &cgh) {
+        /*
+        DPCT1101:96: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
+        replaced with a value. Modify the code to use the original expression,
+        provided in comments, if it is correct.
+        */
+        sycl::local_accessor<float, 1> buf_acc_ct1(
+            sycl::range<1>(32 /*CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
+                             buf_acc_ct1.get_pointer());
+            });
+    });
+}
+
+static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH,
+                                int OW, int OH, int KW, int KH, int IC,
+                                int offset_delta, int s0, int s1, int p0,
+                                int p1, int d0, int d1,
+                                dpct::queue_ptr stream) {
+    const int parallel_elements = OW * KW * KH;
+    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    sycl::range<3> block_nums(IC, OH, num_blocks);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums *
+                                  sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH,
+                               parallel_elements, (IC * KH * KW), s0, s1, p0,
+                               p1, d0, d1, item_ct1);
+            });
+    }
+}
+
+// buffer pool for cuda
+#define MAX_CUDA_BUFFERS 256
+
+struct scoped_spin_lock {
+    std::atomic_flag& lock;
+    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+        while (lock.test_and_set(std::memory_order_acquire)) {
+            ; // spin
+        }
+    }
+    ~scoped_spin_lock() {
+        lock.clear(std::memory_order_release);
+    }
+    scoped_spin_lock(const scoped_spin_lock&) = delete;
+    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+};
+
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+// #define DEBUG_CUDA_MALLOC
+struct cuda_buffer {
+    void * ptr = nullptr;
+    size_t size = 0;
+};
+
+static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
+static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
+
+static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+#ifdef DEBUG_CUDA_MALLOC
+    int nnz = 0;
+    size_t max_size = 0;
+#endif
+    size_t best_diff = 1ull << 36;
+    int ibest = -1;
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][i];
+        if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+            ++nnz;
+            if (b.size > max_size) max_size = b.size;
+#endif
+            if (b.size >= size) {
+                size_t diff = b.size - size;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    ibest = i;
+                    if (!best_diff) {
+                        void * ptr = b.ptr;
+                        *actual_size = b.size;
+                        b.ptr = nullptr;
+                        b.size = 0;
+                        return ptr;
+                    }
+                }
+            }
+        }
+    }
+    if (ibest >= 0) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        void * ptr = b.ptr;
+        *actual_size = b.size;
+        b.ptr = nullptr;
+        b.size = 0;
+        return ptr;
+    }
+    void * ptr;
+    size_t look_ahead_size = (size_t) (1.05 * size);
+    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(
+                             look_ahead_size, dpct::get_in_order_queue())));
+    *actual_size = look_ahead_size;
+    g_cuda_pool_size[id] += look_ahead_size;
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
+#endif
+    return ptr;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][i];
+        if (b.ptr == nullptr) {
+            b.ptr = ptr;
+            b.size = size;
+            return;
+        }
+    }
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+    g_cuda_pool_size[id] -= size;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+#if !defined(GGML_USE_HIPBLAS)
+// pool with virtual memory
+/*
+DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.
+*/
+static std::vector<CUmemGenericAllocationHandle>
+    g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
+static dpct::device_ptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
+static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
+static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
+
+static void *ggml_cuda_pool_malloc_vmm(size_t size, size_t *actual_size) try {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+
+    // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
+    const size_t alignment = 128;
+    size = alignment * ((size + alignment - 1) / alignment);
+
+    size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
+
+    if (size > avail) {
+        // round up to the next multiple of the granularity
+        size_t reserve_size = size - avail;
+        const size_t granularity = g_device_caps[id].vmm_granularity;
+        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
+
+        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
+
+        // allocate more physical memory
+        /*
+        DPCT1082:65: Migration of CUmemAllocationProp type is not supported.
+        */
+        CUmemAllocationProp prop = {};
+        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        prop.location.id = id;
+        /*
+        DPCT1082:66: Migration of CUmemGenericAllocationHandle type is not
+        supported.
+        */
+        CUmemGenericAllocationHandle handle;
+        /*
+        DPCT1007:69: Migration of cuMemCreate is not supported.
+        */
+        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
+
+        // reserve virtual address space (if not already reserved)
+        if (g_cuda_pool_addr[id] == 0) {
+            /*
+            DPCT1007:70: Migration of cuMemAddressReserve is not supported.
+            */
+            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id],
+                                         CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
+        }
+
+        // map at the end of the pool
+        /*
+        DPCT1007:71: Migration of cuMemMap is not supported.
+        */
+        CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id],
+                          reserve_size, 0, handle, 0));
+
+        // set access
+        /*
+        DPCT1082:72: Migration of CUmemAccessDesc type is not supported.
+        */
+        CUmemAccessDesc access = {};
+        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+        access.location.id = id;
+        access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
+        /*
+        DPCT1007:73: Migration of cuMemSetAccess is not supported.
+        */
+        CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id],
+                                reserve_size, &access, 1));
+
+        // add to the pool
+        g_cuda_pool_handles[id].push_back(handle);
+        g_cuda_pool_size[id] += reserve_size;
+
+        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
+        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
+        //       (unsigned long long) (reserve_size/1024/1024));
+    }
+
+    GGML_ASSERT(g_cuda_pool_addr[id] != 0);
+
+    void * ptr = (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]);
+    *actual_size = size;
+    g_cuda_pool_used[id] += size;
+
+#ifdef DEBUG_CUDA_MALLOC
+    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
+#endif
+
+    return ptr;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+
+#ifdef DEBUG_CUDA_MALLOC
+    printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
+#endif
+
+    g_cuda_pool_used[id] -= size;
+
+    // all deallocations must be in reverse order of the allocations
+    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try {
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    if (g_device_caps[id].vmm) {
+        return ggml_cuda_pool_malloc_vmm(size, actual_size);
+    } else {
+        return ggml_cuda_pool_malloc_leg(size, actual_size);
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_pool_free(void *ptr, size_t size) try {
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+    if (g_device_caps[id].vmm) {
+        ggml_cuda_pool_free_vmm(ptr, size);
+    } else {
+        ggml_cuda_pool_free_leg(ptr, size);
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+#else
+#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
+#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
+#endif // !defined(GGML_USE_HIPBLAS)
+
+template<typename T>
+struct cuda_pool_alloc {
+    T * ptr = nullptr;
+    size_t actual_size = 0;
+
+    // size is in number of elements
+    T * alloc(size_t size) {
+        GGML_ASSERT(ptr == nullptr);
+        ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->actual_size);
+        return ptr;
+    }
+
+    cuda_pool_alloc(size_t size) {
+        alloc(size);
+    }
+
+    ~cuda_pool_alloc() {
+        if (ptr != nullptr) {
+            ggml_cuda_pool_free(ptr, actual_size);
+        }
+    }
+
+    T * get() {
+        return ptr;
+    }
+
+    cuda_pool_alloc() = default;
+    cuda_pool_alloc(const cuda_pool_alloc &) = delete;
+    cuda_pool_alloc(cuda_pool_alloc &&) = delete;
+    cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
+    cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
+};
+
+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+    return g_cublas_loaded;
+}
+
+void ggml_init_cublas() try {
+    static bool initialized = false;
+
+    if (!initialized) {
+
+#ifdef __HIP_PLATFORM_AMD__
+        // Workaround for a rocBLAS bug when using multiple graphics cards:
+        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+        rocblas_initialize();
+        CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
+        if (DPCT_CHECK_ERROR(g_device_count =
+                                 dpct::dev_mgr::instance().device_count()) !=
+            0) {
+            initialized = true;
+            g_cublas_loaded = false;
+            return;
+        }
+
+        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
+        int64_t total_vram = 0;
+#if defined(GGML_CUDA_FORCE_MMQ)
+        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
+#else
+        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
+#endif
+#if defined(CUDA_USE_TENSOR_CORES)
+        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+#endif
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
+        for (int id = 0; id < g_device_count; ++id) {
+            int device_vmm = 0;
+
+#if !defined(GGML_USE_HIPBLAS)
+            int device;
+            CU_CHECK(DPCT_CHECK_ERROR(device = id));
+            /*
+            DPCT1028:74: The cuDeviceGetAttribute was not migrated because
+            parameter CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED is
+            unsupported.
+            */
+            CU_CHECK(cuDeviceGetAttribute(
+                &device_vmm,
+                CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
+                device));
+
+            if (device_vmm) {
+                /*
+                DPCT1082:75: Migration of CUmemAllocationProp type is not
+                supported.
+                */
+                CUmemAllocationProp alloc_prop = {};
+                alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+                alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+                alloc_prop.location.id = id;
+                /*
+                DPCT1007:76: Migration of cuMemGetAllocationGranularity is not
+                supported.
+                */
+                CU_CHECK(cuMemGetAllocationGranularity(
+                    &g_device_caps[id].vmm_granularity, &alloc_prop,
+                    CU_MEM_ALLOC_GRANULARITY_MINIMUM));
+            }
+#endif // !defined(GGML_USE_HIPBLAS)
+            g_device_caps[id].vmm = !!device_vmm;
+
+            dpct::device_info prop;
+            CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+                prop, dpct::dev_mgr::instance().get_device(id))));
+            /*
+            DPCT1005:77: The SYCL device version is different from CUDA Compute
+            Compatibility. You may need to rewrite this code.
+            */
+            fprintf(stderr,
+                    "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id,
+                    prop.get_name(), prop.get_major_version(),
+                    prop.get_minor_version(), device_vmm ? "yes" : "no");
+
+            g_tensor_split[id] = total_vram;
+            total_vram += prop.get_global_mem_size();
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
+#else
+            /*
+            DPCT1005:78: The SYCL device version is different from CUDA Compute
+            Compatibility. You may need to rewrite this code.
+            */
+            g_device_caps[id].cc =
+                100 * prop.get_major_version() + 10 * prop.get_minor_version();
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+        }
+        for (int id = 0; id < g_device_count; ++id) {
+            g_tensor_split[id] /= total_vram;
+        }
+
+        for (int id = 0; id < g_device_count; ++id) {
+            CUDA_CHECK(ggml_cuda_set_device(id));
+
+            // create cuda streams
+            for (int is = 0; is < MAX_STREAMS; ++is) {
+                /*
+                DPCT1025:79: The SYCL queue is created ignoring the flag and
+                priority options.
+                */
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    g_cudaStreams[id][is] =
+                        dpct::get_current_device().create_queue()));
+            }
+
+            // create cublas handle
+            CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
+                                              &dpct::get_in_order_queue()));
+            /*
+            DPCT1027:80: The call to cublasSetMathMode was replaced with 0
+            because this functionality is redundant in SYCL.
+            */
+            CUBLAS_CHECK(0);
+        }
+
+        // configure logging to stdout
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+
+        initialized = true;
+        g_cublas_loaded = true;
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
+    bool all_zero = true;
+    for (int i = 0; i < g_device_count; ++i) {
+        if (tensor_split[i] != 0.0f) {
+            all_zero = false;
+            break;
+        }
+    }
+    if (all_zero) {
+        return;
+    }
+    float split_sum = 0.0f;
+    for (int i = 0; i < g_device_count; ++i) {
+        g_tensor_split[i] = split_sum;
+        split_sum += tensor_split[i];
+    }
+    for (int i = 0; i < g_device_count; ++i) {
+        g_tensor_split[i] /= split_sum;
+    }
+}
+
+void *ggml_cuda_host_malloc(size_t size) try {
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    dpct::err0 err = DPCT_CHECK_ERROR(
+        ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
+    /*
+    DPCT1000:82: Error handling if-stmt was detected but could not be rewritten.
+    */
+    if (err != 0) {
+        // clear the error
+        /*
+        DPCT1026:83: The call to cudaGetLastError was removed because this
+        functionality is redundant in SYCL.
+        */
+        /*
+        DPCT1001:81: The statement could not be removed.
+        */
+        fprintf(
+            stderr,
+            "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            /*
+            DPCT1009:84: SYCL uses exceptions to report errors and does not use
+            the error codes. The original code was commented out and a warning
+            string was inserted. You need to rewrite this code.
+            */
+            size / 1024.0 / 1024.0,
+            "cudaGetErrorString is not supported" /*cudaGetErrorString(err)*/);
+        return nullptr;
+    }
+
+    return ptr;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_host_free(void *ptr) try {
+    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
+                                          const struct ggml_tensor *src,
+                                          int64_t i3, int64_t i2,
+                                          int64_t i1_low, int64_t i1_high,
+                                          dpct::queue_ptr stream) try {
+
+    dpct::memcpy_direction kind;
+    char * src_ptr;
+    if (src->backend == GGML_BACKEND_CPU) {
+        kind = dpct::host_to_device;
+        src_ptr = (char *) src->data;
+    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
+        kind = dpct::device_to_device;
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        int id;
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            id = dpct::dev_mgr::instance().current_device_id()));
+        src_ptr = (char *) extra->data_device[id];
+    } else {
+        GGML_ASSERT(false);
+    }
+    char * dst_ptr = (char *) dst;
+
+    const int64_t ne0 = src->ne[0];
+    const int64_t nb0 = src->nb[0];
+    const int64_t nb1 = src->nb[1];
+    const int64_t nb2 = src->nb[2];
+    const int64_t nb3 = src->nb[3];
+    const enum ggml_type type = src->type;
+    const int64_t ts = ggml_type_size(type);
+    const int64_t bs = ggml_blck_size(type);
+    int64_t i1_diff = i1_high - i1_low;
+
+    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == ts*ne0/bs) {
+        return DPCT_CHECK_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1));
+    } else if (nb0 == ts) {
+        return DPCT_CHECK_ERROR(
+            dpct::async_dpct_memcpy(dst_ptr, ts * ne0 / bs, x, nb1,
+                                    ts * ne0 / bs, i1_diff, kind, *stream));
+    } else {
+        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+            const void * rx = (const void *) ((const char *) x + i1*nb1);
+            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+            // pretend the row is a matrix with cols=1
+            dpct::err0 r = DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
+                rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream));
+            /*
+            DPCT1001:85: The statement could not be removed.
+            */
+            /*
+            DPCT1000:86: Error handling if-stmt was detected but could not be
+            rewritten.
+            */
+            if (r != 0) return r;
+        }
+        return 0;
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_op_get_rows(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_d, const float *src1_d,
+                                  float *dst_d, const dpct::queue_ptr &stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda_float(src0, src1, dst, (const sycl::half *)src0_d,
+                                src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        default:
+            // TODO: k-quants
+            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
+            GGML_ASSERT(false);
+            break;
+    }
+}
+
+template <class op>
+inline void ggml_cuda_op_bin_bcast(const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd,
+                                   const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
+             (sycl::half *)dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
+             main_stream);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ASSERT(false);
+    }
+}
+
+static void ggml_cuda_op_repeat(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const float *src0_d, const float *src1_d,
+                                float *dst_d,
+                                const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
+
+    (void) src1;
+    (void) src1_d;
+}
+
+inline void ggml_cuda_op_add(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_cuda_op_acc(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
+
+    (void) dst;
+}
+
+inline void ggml_cuda_op_mul(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_cuda_op_div(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_cuda_op_gelu(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_silu(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_gelu_quick(const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_relu(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_leaky_relu(const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_norm(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_group_norm(const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int num_groups = dst->op_params[0];
+    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_concat(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const float *src0_dd, const float *src1_dd,
+                                float *dst_dd,
+                                const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
+    }
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_upscale(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const float *src0_dd, const float *src1_dd,
+                                 float *dst_dd,
+                                 const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    const int scale_factor = dst->op_params[0];
+
+    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_pad(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    pad_f32_cuda(src0_dd, dst_dd,
+        src0->ne[0], src0->ne[1], src0->ne[2],
+        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_rms_norm(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_mul_mat_q(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) try {
+
+    const int64_t ne00 = src0->ne[0];
+
+    const int64_t ne10 = src1->ne[0];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t row_diff = row_high - row_low;
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_ddf_i;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static int64_t get_row_rounding(ggml_type type) {
+    int64_t min_compute_capability = INT_MAX;
+    int64_t max_compute_capability = INT_MIN;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            if (min_compute_capability > g_device_caps[id].cc) {
+                min_compute_capability = g_device_caps[id].cc;
+            }
+            if (max_compute_capability < g_device_caps[id].cc) {
+                max_compute_capability = g_device_caps[id].cc;
+            }
+        }
+    }
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            return 1;
+        case GGML_TYPE_Q2_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
+        case GGML_TYPE_Q3_K:
+            return min_compute_capability < CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        default:
+            GGML_ASSERT(false);
+    }
+#else
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 64;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
+        case GGML_TYPE_Q6_K:
+            return 64;
+        default:
+            GGML_ASSERT(false);
+    }
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}
+
+inline void ggml_cuda_op_mul_mat_vec_q(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) {
+
+    GGML_ASSERT(ggml_nrows(src1) == 1);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_ddf_i;
+    (void) src1_ncols;
+    (void) src1_padded_row_size;
+}
+
+inline void ggml_cuda_op_dequantize_mul_mat_vec(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) {
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_F16
+    cuda_pool_alloc<half> src1_dfloat_a;
+    half * src1_dfloat = nullptr; // dfloat == half
+
+    bool src1_convert_f16 =
+        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+    if (src1_convert_f16) {
+        src1_dfloat = src1_dfloat_a.alloc(ne00);
+        ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                                ne00, 1, sizeof(float), 0, 0,
+                                ne00, 1, sizeof(half),  0, 0, stream);
+    }
+#else
+    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
+#endif // GGML_CUDA_F16
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_F16:
+            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_ddq_i;
+    (void) src1_ncols;
+    (void) src1_padded_row_size;
+}
+
+inline void ggml_cuda_op_mul_mat_cublas(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) try {
+
+    GGML_ASSERT(src0_dd_i  != nullptr);
+    GGML_ASSERT(src1_ddf_i != nullptr);
+    GGML_ASSERT(dst_dd_i   != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t row_diff = row_high - row_low;
+
+    int id;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // ldc == nrows of the matrix that cuBLAS writes into
+    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+
+    const int compute_capability = g_device_caps[id].cc;
+
+    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+        cuda_pool_alloc<sycl::half> src0_as_f16;
+        if (src0->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = row_diff*ne00;
+            src0_as_f16.alloc(ne);
+            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
+        }
+        const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16
+                                         ? (const sycl::half *)src0_dd_i
+                                         : src0_as_f16.get();
+
+        cuda_pool_alloc<sycl::half> src1_as_f16;
+        if (src1->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = src1_ncols*ne10;
+            src1_as_f16.alloc(ne);
+            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
+        }
+        const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
+                                         ? (const sycl::half *)src1_ddf_i
+                                         : src1_as_f16.get();
+        cuda_pool_alloc<sycl::half> dst_f16(row_diff * src1_ncols);
+
+        const sycl::half alpha_f16 = 1.0f;
+        const sycl::half beta_f16 = 0.0f;
+
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm(
+            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
+            &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
+            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
+            dst_f16.get(), dpct::library_data_t::real_half, ldc,
+            dpct::library_data_t::real_half)));
+
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+    }
+    else {
+        cuda_pool_alloc<float> src0_ddq_as_f32;
+
+        if (src0->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src0_ddq_as_f32.alloc(row_diff*ne00);
+            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
+        }
+        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
+
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(
+            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
+            dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00,
+            src1_ddf_i, ne10, dpct::get_value(&beta, *g_cublas_handles[id]),
+            dst_dd_i, ldc)));
+    }
+
+    (void) dst;
+    (void) src1_ddq_i;
+    (void) src1_padded_row_size;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+inline void ggml_cuda_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t nrows = ggml_nrows(src0);
+
+    //const int n_past      = ((int32_t *) dst->op_params)[0];
+    const int n_dims      = ((int32_t *) dst->op_params)[1];
+    const int mode        = ((int32_t *) dst->op_params)[2];
+    const int n_ctx       = ((int32_t *) dst->op_params)[3];
+    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
+
+    // RoPE alteration for extended context
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+    const int32_t * pos = nullptr;
+    if ((mode & 1) == 0) {
+        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        GGML_ASSERT(src1->ne[0] == ne2);
+        pos = (const int32_t *) src1_dd;
+    }
+
+    const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
+
+    rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
+    // compute
+    if (is_glm) {
+        GGML_ASSERT(false);
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
+    } else if (is_neox) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_neox_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_neox_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
+                           ne00, n_dims, nrows, pos, freq_scale, ne01,
+                           freq_base, ext_factor, attn_factor, corr_dims,
+                           main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
+    } else {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
+                      nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                      attn_factor, corr_dims, main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t nrows = ggml_nrows(src0);
+
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_head = ((int32_t *) dst->op_params)[1];
+    float max_bias;
+    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+    //GGML_ASSERT(ne01 + n_past == ne00);
+    GGML_ASSERT(n_head == ne02);
+
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
+
+    (void) src1;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_im2col(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const float *src0_dd, const float *src1_dd,
+                                float *dst_dd,
+                                const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW =         src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW =         src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW =         dst->ne[1];
+
+    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+    im2col_f32_f16_cuda(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH,
+                        IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+
+    (void) src0;
+    (void) src0_dd;
+}
+
+inline void ggml_cuda_op_sum_rows(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_argsort(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const float *src0_dd, const float *src1_dd,
+                                 float *dst_dd,
+                                 const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_diag_mask_inf(const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst, const float *src0_dd,
+                                       const float *src1_dd, float *dst_dd,
+                                       const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int nrows0 = ggml_nrows(src0);
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+
+    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_soft_max(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
+
+    float scale = 1.0f;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
+
+    (void) dst;
+}
+
+inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float scale;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
+    /*
+    DPCT1010:87: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    CUDA_CHECK(0);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+
+    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    /*
+    DPCT1010:88: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    CUDA_CHECK(0);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_flatten(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const ggml_cuda_op_flatten_t op) try {
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const bool use_src1 = src1 != nullptr;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(              dst->backend != GGML_BACKEND_GPU_SPLIT);
+
+    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
+
+    const bool src0_on_device =             src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
+    const bool  dst_on_device =              dst->backend == GGML_BACKEND_GPU;
+
+    // dd = data device
+    float * src0_ddf = nullptr;
+    float * src1_ddf = nullptr;
+    float *  dst_ddf = nullptr;
+
+    cuda_pool_alloc<float> src0_f;
+    cuda_pool_alloc<float> src1_f;
+    cuda_pool_alloc<float>  dst_f;
+
+    ggml_cuda_set_device(g_main_device);
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    if (src0_on_device) {
+        src0_ddf = (float *) src0_extra->data_device[g_main_device];
+    } else {
+        src0_ddf = src0_f.alloc(ggml_nelements(src0));
+        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
+    }
+
+    if (use_src1) {
+        if (src1_on_device) {
+            src1_ddf = (float *) src1_extra->data_device[g_main_device];
+        } else {
+            src1_ddf = src1_f.alloc(ggml_nelements(src1));
+            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
+        }
+    }
+    if (dst_on_device) {
+        dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    } else {
+        dst_ddf = dst_f.alloc(ggml_nelements(dst));
+    }
+
+    // do the computation
+    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    /*
+    DPCT1010:89: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    CUDA_CHECK(0);
+
+    // copy dst to host if necessary
+    if (!dst_on_device) {
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            dpct::get_current_device().queues_wait_and_throw()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_set_peer_access(const int n_tokens) {
+    static bool peer_access_enabled = false;
+
+    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+    if (peer_access_enabled == enable_peer_access) {
+        return;
+    }
+
+#ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+        CUDA_CHECK(cudaDeviceSynchronize());
+    }
+
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        for (int id_other = 0; id_other < g_device_count; ++id_other) {
+            if (id == id_other) {
+                continue;
+            }
+            if (id != g_main_device && id_other != g_main_device) {
+                continue;
+            }
+
+            int can_access_peer;
+            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            if (can_access_peer) {
+                if (enable_peer_access) {
+                    CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+                } else {
+                    CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+                }
+            }
+        }
+    }
+#endif // NDEBUG
+
+    peer_access_enabled = enable_peer_access;
+}
+
+static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 ggml_cuda_op_mul_mat_t op,
+                                 const bool convert_src1_to_q8_1) try {
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nrows1 = ggml_nrows(src1);
+
+    GGML_ASSERT(ne03 == ne13);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
+
+    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
+
+    const int64_t i02_divisor = ne12 / ne02;
+
+    const size_t src0_ts = ggml_type_size(src0->type);
+    const size_t src0_bs = ggml_blck_size(src0->type);
+    const size_t q8_1_ts = sizeof(block_q8_1);
+    const size_t q8_1_bs = QK8_1;
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
+
+    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src0_is_contiguous = ggml_is_contiguous(src0);
+    const bool src1_is_contiguous = ggml_is_contiguous(src1);
+
+    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
+
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 > 1));
+    GGML_ASSERT(!(split && ne03 > 1));
+    GGML_ASSERT(!(split && ne02 < ne12));
+
+    // dd = data device
+    char  *  src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
+    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
+    char  * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
+    float *   dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+    // as = actual size
+    size_t  src0_as[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t   dst_as[GGML_CUDA_MAX_DEVICES] = {0};
+
+    int64_t  row_low[GGML_CUDA_MAX_DEVICES];
+    int64_t row_high[GGML_CUDA_MAX_DEVICES];
+
+    int used_devices = 0;
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        // by default, use all rows
+        row_low[id]  = 0;
+        row_high[id] = ne01;
+
+        // for multi GPU, get the row boundaries from tensor split
+        // and round to mul_mat_q tile sizes
+        if (split) {
+            const int64_t rounding = get_row_rounding(src0->type);
+
+            if (id != 0) {
+                row_low[id]  = ne01*g_tensor_split[id];
+                if (row_low[id] < ne01) {
+                    row_low[id] -= row_low[id] % rounding;
+                }
+            }
+
+            if (id != g_device_count - 1) {
+                row_high[id]  = ne01*g_tensor_split[id + 1];
+                if (row_high[id] < ne01) {
+                    row_high[id] -= row_high[id] % rounding;
+                }
+            }
+        }
+    }
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
+
+        used_devices++;
+
+        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+        const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+
+        ggml_cuda_set_device(id);
+        const dpct::queue_ptr stream = g_cudaStreams[id][0];
+
+        if (src0_on_device && src0_is_contiguous) {
+            src0_dd[id] = (char *) src0_extra->data_device[id];
+        } else {
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
+        }
+
+        if (src1_on_device && src1_is_contiguous) {
+            src1_ddf[id] = (float *) src1_extra->data_device[id];
+        } else {
+            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
+        }
+
+        if (convert_src1_to_q8_1) {
+            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
+
+            if (src1_on_device && src1_is_contiguous) {
+                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
+                /*
+                DPCT1010:90: SYCL uses exceptions to report errors and does not
+                use the error codes. The call was replaced with 0. You need to
+                rewrite this code.
+                */
+                CUDA_CHECK(0);
+            }
+        }
+
+        if (dst_on_device) {
+            dst_dd[id] = (float *) dst_extra->data_device[id];
+        } else {
+            const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
+            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
+        }
+    }
+
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signals that the main device has finished calculating the input data
+    if (split && used_devices > 1) {
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        /*
+        DPCT1024:91: The original code returned the error code that was further
+        consumed by the program logic. This original code was replaced with 0.
+        You may need to rewrite the program logic consuming the error code.
+        */
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            *src0_extra->events[g_main_device][0] =
+                g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier()));
+    }
+
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
+        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
+        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
+
+        for (int64_t id = 0; id < g_device_count; ++id) {
+            if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+                continue;
+            }
+
+            const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+            const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+            const int64_t row_diff = row_high[id] - row_low[id];
+
+            ggml_cuda_set_device(id);
+            const dpct::queue_ptr stream = g_cudaStreams[id][is];
+
+            // wait for main GPU data if necessary
+            if (split && (id != g_main_device || is != 0)) {
+                CUDA_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier(
+                    {*src0_extra->events[g_main_device][0]})));
+            }
+
+            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
+                const int64_t i03 = i0 / ne12;
+                const int64_t i02 = i0 % ne12;
+
+                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
+
+                // for split tensors the data begins at i0 == i0_offset_low
+                char  *  src0_dd_i =  src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
+                float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
+                char  * src1_ddq_i = src1_ddq[id] +  src1_ddq_i_offset;
+                float *   dst_dd_i =   dst_dd[id] + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
+
+                // the main device memory buffer can be on VRAM scratch, with space for all partial results
+                // in that case an offset on dst_ddf_i is needed
+                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+                    dst_dd_i += row_low[id]; // offset is 0 if no tensor split
+                }
+
+                // copy src0, src1 to device if necessary
+                if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
+                    if (id != g_main_device) {
+                        if (convert_src1_to_q8_1) {
+                            char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
+                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                                src1_ddq_i, src1_ddq_i_source,
+                                src1_ncols * src1_padded_col_size * q8_1_ts /
+                                    q8_1_bs)));
+                        } else {
+                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
+                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
+                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                                src1_ddf_i, src1_ddf_i_source,
+                                src1_ncols * ne10 * sizeof(float))));
+                        }
+                    }
+                } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
+                                   src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
+                } else {
+                    GGML_ASSERT(false);
+                }
+
+                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
+                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+                    /*
+                    DPCT1010:92: SYCL uses exceptions to report errors and does
+                    not use the error codes. The call was replaced with 0. You
+                    need to rewrite this code.
+                    */
+                    CUDA_CHECK(0);
+                }
+
+                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
+                }
+
+                // do the computation
+                op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
+                   row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
+                /*
+                DPCT1010:93: SYCL uses exceptions to report errors and does not
+                use the error codes. The call was replaced with 0. You need to
+                rewrite this code.
+                */
+                CUDA_CHECK(0);
+
+                // copy dst to host or other device if necessary
+                if (!dst_on_device) {
+                    void * dst_off_device;
+                    dpct::memcpy_direction kind;
+                    if (dst->backend == GGML_BACKEND_CPU) {
+                        dst_off_device = dst->data;
+                        kind = dpct::device_to_host;
+                    } else if (dst->backend == GGML_BACKEND_GPU) {
+                        dst_off_device = dst_extra->data_device[g_main_device];
+                        kind = dpct::device_to_device;
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                    if (split) {
+                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
+                        // dst is NOT transposed.
+                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
+                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                        dhf_dst_i += src1_col_0*ne0 + row_low[id];
+                        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
+                            dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
+                            row_diff * sizeof(float), row_diff * sizeof(float),
+                            src1_ncols, kind, *stream)));
+                    } else {
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                        dhf_dst_i += src1_col_0*ne0;
+                        CUDA_CHECK(DPCT_CHECK_ERROR(
+                            stream->memcpy(dhf_dst_i, dst_dd_i,
+                                           src1_ncols * ne0 * sizeof(float))));
+                    }
+                }
+
+                // add event for the main device to wait on until other device is done
+                if (split && (id != g_main_device || is != 0)) {
+                    /*
+                    DPCT1024:94: The original code returned the error code that
+                    was further consumed by the program logic. This original
+                    code was replaced with 0. You may need to rewrite the
+                    program logic consuming the error code.
+                    */
+                    CUDA_CHECK(DPCT_CHECK_ERROR(
+                        *src0_extra->events[id][is] =
+                            stream->ext_oneapi_submit_barrier()));
+                }
+            }
+        }
+    }
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        // free buffers again when done
+        if (dst_as[id] > 0) {
+            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
+        }
+        if (src1_asq[id] > 0) {
+            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
+        }
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (src0_as[id] > 0) {
+            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
+        }
+    }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
+        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
+
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        for (int64_t id = 0; id < g_device_count; ++id) {
+            if (row_low[id] == row_high[id]) {
+                continue;
+            }
+            for (int64_t is = 0; is < is_max; ++is) {
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier(
+                        {*src0_extra->events[id][is]})));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            dpct::get_current_device().queues_wait_and_throw()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
+}
+
+static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
+}
+
+static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
+}
+
+static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
+}
+
+static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
+}
+
+static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
+}
+
+static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
+}
+
+static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
+}
+
+static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
+}
+
+static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
+}
+
+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
+static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
+}
+
+static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
+}
+
+static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
+}
+
+static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
+}
+
+static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
+}
+
+static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
+}
+
+bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_cublas_loaded) return false;
+
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    // TODO: find the optimal values for these
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+            src1->type == GGML_TYPE_F32 &&
+             dst->type == GGML_TYPE_F32 &&
+            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
+}
+
+static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst) try {
+    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
+    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t ne12 = src1->ne[2];
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0,
+                                     const ggml_tensor *src1,
+                                     ggml_tensor *dst) try {
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+    GGML_ASSERT(!ggml_is_permuted(src0));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    const int64_t ne12 = src1->ne[2];
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    const int64_t row_stride_x = nb01 / sizeof(sycl::half);
+    const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
+
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
+                                   const sycl::half *src1_as_f16, char *dst,
+                                   const void **ptrs_src, void **ptrs_dst,
+                                   int64_t ne12, int64_t ne13, int64_t ne23,
+                                   size_t nb02, size_t nb03, size_t nb12,
+                                   size_t nb13, size_t nbd2, size_t nbd3,
+                                   int64_t r2, int64_t r3,
+                                   const sycl::nd_item<3> &item_ct1) {
+    int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                  item_ct1.get_local_id(2);
+    int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
+                  item_ct1.get_local_id(1);
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int64_t i03 = i13 / r3;
+    int64_t i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2   + i13*nbd3;
+}
+
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
+                                                 const ggml_tensor *src1,
+                                                 ggml_tensor *dst) try {
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne  = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    CUBLAS_CHECK(
+        DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream));
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+    sycl::half *src0_as_f16 = (sycl::half *)src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    cuda_pool_alloc<sycl::half> src1_as_f16(ne1);
+    to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
+
+    cuda_pool_alloc<sycl::half> dst_f16;
+    char * dst_t;
+
+    dpct::library_data_t cu_compute_type = CUBLAS_COMPUTE_16F;
+    dpct::library_data_t cu_data_type = dpct::library_data_t::real_half;
+
+    // dst strides
+    size_t nbd2 = dst->nb[2];
+    size_t nbd3 = dst->nb[3];
+
+    const sycl::half alpha_f16 = 1.0f;
+    const sycl::half beta_f16 = 0.0f;
+
+    const float alpha_f32 = 1.0f;
+    const float beta_f32  = 0.0f;
+
+    const void * alpha = &alpha_f16;
+    const void * beta  = &beta_f16;
+
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        dst_t = (char *) dst_f16.alloc(ne);
+
+        nbd2 /= sizeof(float) / sizeof(sycl::half);
+        nbd3 /= sizeof(float) / sizeof(sycl::half);
+    } else {
+        dst_t = (char *) dst_ddf;
+
+        cu_compute_type = CUBLAS_COMPUTE_32F;
+        cu_data_type = dpct::library_data_t::real_float;
+
+        alpha = &alpha_f32;
+        beta  = &beta_f32;
+    }
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+#if 0
+    // use cublasGemmEx
+    {
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;
+
+                CUBLAS_CHECK(
+                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
+                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
+                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
+                            cu_compute_type,
+                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+            }
+        }
+    }
+#else
+    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+        // use cublasGemmStridedBatchedEx
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
+            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
+            (const char *)src0_as_f16, dpct::library_data_t::real_half,
+            nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
+            (const char *)src1_as_f16.get(), dpct::library_data_t::real_half,
+            nb11 / sizeof(float), src1->nb[2] / sizeof(float), beta,
+            (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float),
+            ne12 * ne13, cu_compute_type)));
+    } else {
+        // use cublasGemmBatchedEx
+        const int ne23 = ne12*ne13;
+
+        cuda_pool_alloc<const void *> ptrs_src(2*ne23);
+        cuda_pool_alloc<      void *> ptrs_dst(1*ne23);
+
+        sycl::range<3> block_dims(1, ne12, ne13);
+        /*
+        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(main_stream->get_device(),
+                                         {sycl::aspect::fp16});
+
+            main_stream->submit([&](sycl::handler &cgh) {
+                const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get();
+                const void **ptrs_src_get_ct3 = ptrs_src.get();
+                void **ptrs_dst_get_ct4 = ptrs_dst.get();
+
+                cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
+                                 [=](sycl::nd_item<3> item_ct1) {
+                                     k_compute_batched_ptrs(
+                                         src0_as_f16, src1_as_f16_get_ct1,
+                                         dst_t, ptrs_src_get_ct3,
+                                         ptrs_dst_get_ct4, ne12, ne13, ne23,
+                                         nb02, nb03, nb12, nb13, nbd2, nbd3, r2,
+                                         r3, item_ct1);
+                                 });
+            });
+        }
+        /*
+        DPCT1010:95: SYCL uses exceptions to report errors and does not use the
+        error codes. The call was replaced with 0. You need to rewrite this
+        code.
+        */
+        CUDA_CHECK(0);
+
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
+            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
+            (const void **)(ptrs_src.get() + 0 * ne23),
+            dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),
+            (const void **)(ptrs_src.get() + 1 * ne23),
+            dpct::library_data_t::real_half, nb11 / sizeof(float), beta,
+            (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
+            cu_compute_type)));
+    }
+#endif
+
+    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const bool all_on_device =
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+        (src1->backend == GGML_BACKEND_GPU) &&
+        ( dst->backend == GGML_BACKEND_GPU);
+
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
+    int64_t min_compute_capability = INT_MAX;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            min_compute_capability = g_device_caps[id].cc;
+        }
+    }
+
+#ifdef CUDA_USE_TENSOR_CORES
+    const bool use_tensor_cores = true;
+#else
+    const bool use_tensor_cores = false;
+#endif
+
+    // debug helpers
+    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
+    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+
+    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // KQ single-batch
+        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
+    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // KQV single-batch
+        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+        // KQ + KQV multi-batch
+        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
+    } else if (src0->type == GGML_TYPE_F32) {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+#ifdef GGML_CUDA_FORCE_DMMV
+            const bool use_mul_mat_vec_q = false;
+#else
+            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
+#endif // GGML_CUDA_FORCE_DMMV
+
+            if (use_mul_mat_vec_q) {
+                // NOTE: this kernel does not support ggml_nrows(src1) > 1
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+            } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+            }
+        } else {
+            bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+            // when tensor cores are available, use them for large batch size
+            // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+            if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+                use_mul_mat_q = false;
+            }
+
+            if (use_mul_mat_q) {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
+            } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+            }
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
+#if 0
+template<typename ... Srcs>
+static __global__ void k_compute_batched_ptrs_id(
+        const void ** ptrs_src, void ** ptrs_dst,
+        int ne12, int ne13,
+        int ne23,
+        int nb02, int nb03,
+        int nb12, int nb13,
+        int nb2, int nb3,
+        int r2, int r3,
+        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
+        const half * src1_f16, half * dst_f16,
+        const int32_t * ids, const int id,
+        Srcs... src0s) {
+
+    int i = ids[id];
+
+    half * src0_f16;
+    const void * srcs_ar[] = { (const half *) src0s... };
+    if (src0_type == GGML_TYPE_F16) {
+        src0_f16 = (half *) srcs_ar[i];
+    } else {
+        src0_f16 = src0_as_f16;
+        if (threadIdx.x == 0 && threadIdx.y == 0) {
+            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
+            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
+        }
+    }
+
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
+    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src00 = dst->src[2];
+
+    const int id = dst->op_params[0];
+
+    GGML_ASSERT(!ggml_is_transposed(src00));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+
+    GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src00->ne[1];
+    const int64_t ne02 = src00->ne[2];
+    const int64_t ne03 = src00->ne[3];
+
+    //const int64_t nb01 = src00->nb[1];
+    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    //const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne  = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
+
+    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    //void * src0_ddq = src0_extra->data_device[g_main_device];
+    //half * src0_as_f16 = (half *) src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16  = 0.0f;
+
+    // use cublasGemmBatchedEx
+    const int ne23 = ne12*ne13;
+
+    const void ** ptrs_src = nullptr;
+          void ** ptrs_dst = nullptr;
+
+    size_t ptrs_src_s = 0;
+    size_t ptrs_dst_s = 0;
+
+    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+
+    int64_t src0_ne = ggml_nelements(src00);
+    half * src0_as_f16 = nullptr;
+    size_t src0_as = 0;
+    if (src00->type != GGML_TYPE_F16) {
+        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
+    }
+
+    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
+    dim3 block_dims(ne13, ne12);
+    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
+            ptrs_src, ptrs_dst,
+            ne12, ne13,
+            ne23,
+            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
+            nb12, nb13,
+            dst->nb[2], dst->nb[3],
+            r2, r3,
+            src00->type, src0_as_f16, src0_ne,
+            src1_as_f16, dst_f16,
+            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
+            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    CUBLAS_CHECK(
+    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+            ne01, ne11, ne10,
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
+                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
+            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+            ne23,
+            CUBLAS_COMPUTE_16F,
+            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    if (src0_as != 0) {
+        ggml_cuda_pool_free(src0_as_f16, src0_as);
+    }
+    if (ptrs_src_s != 0) {
+        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+    }
+    if (ptrs_dst_s != 0) {
+        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+    }
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
+}
+#endif
+
+static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
+                                 const ggml_tensor *src1,
+                                 ggml_tensor *dst) try {
+#if 0
+    ggml_cuda_mul_mat_id_cublas(dst);
+    // TODO: mmq/mmv support
+#endif
+
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb1  =  dst->nb[1];
+
+    const struct ggml_tensor * ids = src0;
+    const int32_t id = ((int32_t *) dst->op_params)[0];
+    const int32_t n_as = ((int32_t *) dst->op_params)[1];
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+
+    const dpct::queue_ptr stream = g_cudaStreams[g_main_device][0];
+
+    if (ids->backend == GGML_BACKEND_GPU) {
+        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
+        CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait()));
+    } else {
+        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
+    }
+
+    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
+    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
+
+    ggml_tensor_extra_gpu src1_row_extra;
+    ggml_tensor_extra_gpu dst_row_extra;
+
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row = *dst;
+
+    src1_row.backend = GGML_BACKEND_GPU;
+    dst_row.backend  = GGML_BACKEND_GPU;
+
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra = &dst_row_extra;
+
+    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
+        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+    char * dst_original  =  dst->backend == GGML_BACKEND_CPU ?
+        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];
+
+    if (src1->ne[1] == 1) {
+        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+        GGML_ASSERT(dst->backend  == GGML_BACKEND_GPU);
+
+        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+            //int32_t row_id;
+            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+            GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
+
+            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+        }
+    } else {
+        cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
+        cuda_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));
+
+        src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
+        dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();
+
+        const dpct::memcpy_direction src1_kind =
+            src1->backend == GGML_BACKEND_CPU ? dpct::host_to_device
+                                              : dpct::device_to_device;
+        const dpct::memcpy_direction dst_kind = dst->backend == GGML_BACKEND_CPU
+                                                    ? dpct::device_to_host
+                                                    : dpct::device_to_device;
+
+        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
+            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+            int64_t num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
+                                   src1_original + i01 * nb11, nb11)));
+                num_src1_rows++;
+            }
+
+            if (num_src1_rows == 0) {
+                continue;
+            }
+
+            src1_row.ne[1] = num_src1_rows;
+            dst_row.ne[1] = num_src1_rows;
+
+            src1_row.nb[1] = nb11;
+            src1_row.nb[2] = num_src1_rows*nb11;
+            src1_row.nb[3] = num_src1_rows*nb11;
+
+            dst_row.nb[1] = nb1;
+            dst_row.nb[2] = num_src1_rows*nb1;
+            dst_row.nb[3] = num_src1_rows*nb1;
+
+            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+
+            num_src1_rows = 0;
+            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+                if (row_id_i != row_id) {
+                    continue;
+                }
+
+                GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+                CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                    dst_original + i01 * nb1,
+                    dst_contiguous.get() + num_src1_rows * nb1, nb1)));
+                num_src1_rows++;
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
+}
+
+static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
+}
+
+static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
+                          ggml_tensor *dst) try {
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne == ggml_nelements(src1));
+
+    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+
+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    GGML_ASSERT(src0->ne[3] == 1);
+
+    const int64_t nb00 = src0->nb[0];
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(src1->ne[3] == 1);
+
+    const int64_t nb10 = src1->nb[0];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+
+    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
+
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else {
+        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ASSERT(false);
+    }
+
+    (void) dst;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    // TODO: why do we pass dst as src1 here?
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
+static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
+}
+
+static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
+}
+
+static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
+}
+
+static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
+}
+
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+}
+
+static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
+}
+
+static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
+}
+
+static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    (void) src0;
+    (void) src1;
+    (void) dst;
+}
+
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
+void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
+    const int64_t nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
+    const size_t nb1 = tensor->nb[1];
+
+    ggml_backend_type backend = tensor->backend;
+    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+    memset(extra, 0, sizeof(*extra));
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (backend == GGML_BACKEND_GPU && id != g_main_device) {
+            continue;
+        }
+
+        ggml_cuda_set_device(id);
+
+        int64_t row_low, row_high;
+        if (backend == GGML_BACKEND_GPU) {
+            row_low = 0;
+            row_high = nrows;
+        } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+            const int64_t rounding = get_row_rounding(tensor->type);
+
+            row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
+            row_low -= row_low % rounding;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows;
+            } else {
+                row_high = nrows*g_tensor_split[id + 1];
+                row_high -= row_high % rounding;
+            }
+        } else {
+            GGML_ASSERT(false);
+        }
+        if (row_low == row_high) {
+            continue;
+        }
+
+        int64_t nrows_split = row_high - row_low;
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        char * buf;
+        CUDA_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(
+                                        size, dpct::get_in_order_queue())));
+        char * buf_host = (char *)data + offset_split;
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(DPCT_CHECK_ERROR(
+                dpct::get_in_order_queue()
+                    .memset(buf + original_size, 0, size - original_size)
+                    .wait()));
+        }
+
+        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                                        .memcpy(buf, buf_host, original_size)
+                                        .wait()));
+
+        extra->data_device[id] = buf;
+
+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+                CUDA_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] =
+                                                new sycl::event()));
+            }
+        }
+    }
+
+    tensor->extra = extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_free_data(struct ggml_tensor *tensor) try {
+    if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(ggml_cuda_set_device(id));
+            CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(
+                extra->data_device[id], dpct::get_in_order_queue())));
+        }
+
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            if (extra->events[id][is] != nullptr) {
+                CUDA_CHECK(ggml_cuda_set_device(id));
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    dpct::destroy_event(extra->events[id][is])));
+            }
+        }
+    }
+
+    delete extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
+    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
+static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
+                                          bool scratch, bool force_inplace,
+                                          bool no_alloc) try {
+    if (scratch && g_scratch_size == 0) {
+        return;
+    }
+
+    tensor->backend = GGML_BACKEND_GPU;
+
+    // recursively assign CUDA buffers until a compute tensor is found
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src[0]->op;
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
+        }
+    }
+    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
+    }
+
+    if (scratch && no_alloc) {
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra;
+
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
+    const size_t size = ggml_nbytes(tensor);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&offset, tensor->op_params, sizeof(size_t));
+        }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra->data_device[g_main_device] = src0_ddc + offset;
+    } else if (tensor->op == GGML_OP_CPY) {
+        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
+        void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra->data_device[g_main_device] = src1_ddv;
+    } else if (scratch) {
+        GGML_ASSERT(size <= g_scratch_size);
+        if (g_scratch_offset + size > g_scratch_size) {
+            g_scratch_offset = 0;
+        }
+
+        char * data = (char *) g_scratch_buffer;
+        if (data == nullptr) {
+            CUDA_CHECK(DPCT_CHECK_ERROR(
+                data = (char *)sycl::malloc_device(
+                    g_scratch_size, dpct::get_in_order_queue())));
+            g_scratch_buffer = data;
+        }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra->data_device[g_main_device] = data + g_scratch_offset;
+
+        g_scratch_offset += size;
+
+        GGML_ASSERT(g_scratch_offset <= g_scratch_size);
+    } else { // allocate new buffers outside of scratch
+        void * data;
+        CUDA_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(
+                                        size, dpct::get_in_order_queue())));
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            dpct::get_in_order_queue().memset(data, 0, size).wait()));
+        extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
+        extra->data_device[g_main_device] = data;
+    }
+
+    tensor->extra = extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor,
+                                     size_t offset) try {
+    if (g_scratch_size == 0) {
+        return;
+    }
+    if (g_scratch_buffer == nullptr) {
+        ggml_cuda_set_device(g_main_device);
+        CUDA_CHECK(
+            DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
+                                 g_scratch_size, dpct::get_in_order_queue())));
+    }
+
+    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+    const bool inplace = tensor->view_src != nullptr;
+
+    if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t view_offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + view_offset;
+    } else {
+        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+    }
+
+    tensor->extra = extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_copy_to_device(struct ggml_tensor *tensor) try {
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(ggml_is_contiguous(tensor));
+
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                                    .memcpy(extra->data_device[g_main_device],
+                                            tensor->data, ggml_nbytes(tensor))
+                                    .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+}
+
+void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
+}
+
+void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
+}
+
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
+}
+
+void ggml_cuda_set_main_device(const int main_device) try {
+    if (main_device >= g_device_count) {
+        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
+                main_device, g_device_count, g_main_device);
+        return;
+    }
+
+    if (g_main_device != main_device && g_device_count > 1) {
+        g_main_device = main_device;
+        dpct::device_info prop;
+        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(g_main_device))));
+        fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__,
+                g_main_device, prop.get_name());
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+    // it still won't always work as expected, but it's better than nothing
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
+    g_scratch_size = std::max(g_scratch_size, scratch_size);
+}
+
+void ggml_cuda_free_scratch() try {
+    if (g_scratch_buffer == nullptr) {
+        return;
+    }
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(
+        sycl::free(g_scratch_buffer, dpct::get_in_order_queue())));
+    g_scratch_buffer = nullptr;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_cublas_loaded) return false;
+
+    ggml_cuda_func_t func;
+    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
+        return false;
+    }
+
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+            return false;
+        }
+    }
+
+    switch (tensor->op) {
+        case GGML_OP_REPEAT:
+            func = ggml_cuda_repeat;
+            break;
+        case GGML_OP_GET_ROWS:
+            func = ggml_cuda_get_rows;
+            break;
+        case GGML_OP_DUP:
+            func = ggml_cuda_dup;
+            break;
+        case GGML_OP_ADD:
+            func = ggml_cuda_add;
+            break;
+        case GGML_OP_ACC:
+            func = ggml_cuda_acc;
+            break;
+        case GGML_OP_MUL:
+            func = ggml_cuda_mul;
+            break;
+        case GGML_OP_DIV:
+            func = ggml_cuda_div;
+            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    func = ggml_cuda_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    func = ggml_cuda_silu;
+                    break;
+                case GGML_UNARY_OP_GELU_QUICK:
+                    func = ggml_cuda_gelu_quick;
+                    break;
+                case GGML_UNARY_OP_TANH:
+                    func = ggml_cuda_tanh;
+                    break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_cuda_relu;
+                    break;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_NORM:
+            func = ggml_cuda_norm;
+            break;
+        case GGML_OP_GROUP_NORM:
+            func = ggml_cuda_group_norm;
+            break;
+        case GGML_OP_CONCAT:
+            func = ggml_cuda_concat;
+            break;
+        case GGML_OP_UPSCALE:
+            func = ggml_cuda_upscale;
+            break;
+        case GGML_OP_PAD:
+            func = ggml_cuda_pad;
+            break;
+        case GGML_OP_LEAKY_RELU:
+            func = ggml_cuda_leaky_relu;
+            break;
+        case GGML_OP_RMS_NORM:
+            func = ggml_cuda_rms_norm;
+            break;
+        case GGML_OP_MUL_MAT:
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
+                return false;
+            }
+            func = ggml_cuda_mul_mat;
+            break;
+        case GGML_OP_MUL_MAT_ID:
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
+                return false;
+            }
+            func = ggml_cuda_mul_mat_id;
+            break;
+        case GGML_OP_SCALE:
+            func = ggml_cuda_scale;
+            break;
+        case GGML_OP_SQR:
+            func = ggml_cuda_sqr;
+            break;
+        case GGML_OP_CLAMP:
+            func = ggml_cuda_clamp;
+            break;
+        case GGML_OP_CPY:
+            func = ggml_cuda_cpy;
+            break;
+        case GGML_OP_CONT:
+            func = ggml_cuda_dup;
+            break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            func = ggml_cuda_nop;
+            break;
+        case GGML_OP_DIAG_MASK_INF:
+            func = ggml_cuda_diag_mask_inf;
+            break;
+        case GGML_OP_SOFT_MAX:
+            func = ggml_cuda_soft_max;
+            break;
+        case GGML_OP_ROPE:
+            func = ggml_cuda_rope;
+            break;
+        case GGML_OP_ALIBI:
+            func = ggml_cuda_alibi;
+            break;
+        case GGML_OP_IM2COL:
+            func = ggml_cuda_im2col;
+            break;
+        case GGML_OP_SUM_ROWS:
+            func = ggml_cuda_sum_rows;
+            break;
+        case GGML_OP_ARGSORT:
+            func = ggml_cuda_argsort;
+            break;
+        default:
+            return false;
+    }
+
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
+        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+    }
+
+    if (params->ith != 0) {
+        return true;
+    }
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return true;
+    }
+    func(tensor->src[0], tensor->src[1], tensor);
+    return true;
+}
+
+int ggml_cuda_get_device_count() try {
+    int device_count;
+    if (DPCT_CHECK_ERROR(device_count =
+                             dpct::dev_mgr::instance().device_count()) != 0) {
+        return 0;
+    }
+    return device_count;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_get_device_description(int device, char *description,
+                                      size_t description_size) try {
+    dpct::device_info prop;
+    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+        prop, dpct::dev_mgr::instance().get_device(device))));
+    snprintf(description, description_size, "%s", prop.get_name());
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#define UNUSED GGML_UNUSED
+
+// cuda buffer
+
+struct ggml_backend_buffer_context_cuda {
+    int device;
+    void * dev_ptr = nullptr;
+    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+    size_t temp_tensor_extra_index = 0;
+
+    ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+
+    ~ggml_backend_buffer_context_cuda() {
+        delete[] temp_tensor_extras;
+    }
+
+    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+        if (temp_tensor_extras == nullptr) {
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
+        }
+
+        size_t alloc_index = temp_tensor_extra_index;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
+        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+        memset(extra, 0, sizeof(*extra));
+
+        return extra;
+    }
+};
+
+static void
+ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
+    delete ctx;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    return ctx->dev_ptr;
+}
+
+static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer,
+                                                 ggml_tensor *tensor) try {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        assert(tensor->view_src->buffer->buft == buffer->buft);
+        tensor->backend = tensor->view_src->backend;
+        tensor->extra = tensor->view_src->extra;
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+    extra->data_device[ctx->device] = tensor->data;
+
+    tensor->backend = GGML_BACKEND_GPU;
+    tensor->extra = extra;
+
+    if (ggml_is_quantized(tensor->type)) {
+        // initialize padding to 0 to avoid possible NaN values
+        int64_t row_low = 0;
+        int64_t row_high = ggml_nrows(tensor);
+        int64_t nrows_split = row_high - row_low;
+
+        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset(
+                (char *)tensor->data + original_size, 0,
+                padded_size - original_size)));
+        }
+    }
+
+    UNUSED(buffer);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer,
+                                                ggml_tensor *tensor,
+                                                const void *data, size_t offset,
+                                                size_t size) try {
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
+
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                             .memcpy((char *)tensor->data + offset, data, size)
+                             .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer,
+                                                const ggml_tensor *tensor,
+                                                void *data, size_t offset,
+                                                size_t size) try {
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(
+        dpct::get_in_order_queue()
+            .memcpy(data, (const char *)tensor->data + offset, size)
+            .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer,
+                                           uint8_t value) try {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    ggml_cuda_set_device(ctx->device);
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                                    .memset(ctx->dev_ptr, value, buffer->size)
+                                    .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor_from = */ NULL,
+    /* .cpy_tensor_to   = */ NULL,
+    /* .clear           = */ ggml_backend_cuda_buffer_clear,
+};
+
+// cuda buffer type
+
+static ggml_backend_buffer_t
+ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                           size_t size) try {
+    int device = (int) (intptr_t) buft->context;
+
+    ggml_cuda_set_device(device);
+
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+    void * dev_ptr;
+    CUDA_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(
+                                    size, dpct::get_in_order_queue())));
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+
+    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return size;
+
+    UNUSED(buft);
+}
+
+static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cuda(backend);
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+    /* .is_host          = */ nullptr,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+
+    static bool ggml_backend_cuda_buffer_type_initialized = false;
+
+    if (!ggml_backend_cuda_buffer_type_initialized) {
+        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
+            ggml_backend_cuda_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
+                /* .context  = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+            };
+        }
+        ggml_backend_cuda_buffer_type_initialized = true;
+    }
+
+    return &ggml_backend_cuda_buffer_types[device];
+}
+
+// host buffer type
+
+static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_cuda_host_free(buffer->context);
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr = ggml_cuda_host_malloc(size);
+
+    if (ptr == nullptr) {
+        // fallback to cpu buffer
+        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
+    }
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
+
+    return buffer;
+}
+
+ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+        /* .iface    = */ {
+            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
+        },
+        /* .context  = */ nullptr,
+    };
+
+    return &ggml_backend_cuda_buffer_type_host;
+}
+
+// backend
+
+struct ggml_backend_context_cuda {
+    int device;
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    delete cuda_ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
+                                               ggml_tensor *tensor,
+                                               const void *data, size_t offset,
+                                               size_t size) try {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+        (char *)tensor->data + offset, data, size)));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
+                                               const ggml_tensor *tensor,
+                                               void *data, size_t offset,
+                                               size_t size) try {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+        data, (const char *)tensor->data + offset, size)));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
+
+    UNUSED(backend);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    GGML_ASSERT(!"not implemented");
+
+    return nullptr;
+
+    UNUSED(backend);
+    UNUSED(cgraph);
+}
+
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    ggml_cuda_set_main_device(cuda_ctx->device);
+
+    ggml_compute_params params = {};
+    params.type = GGML_TASK_COMPUTE;
+    params.ith = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+            continue;
+
+        assert(node->backend == GGML_BACKEND_GPU);
+        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        assert(node->extra != nullptr);
+
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->extra != nullptr);
+            }
+        }
+
+        bool ok = ggml_cuda_compute_forward(&params, node);
+        if (!ok) {
+            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+
+#if 0
+        if (node->type == GGML_TYPE_F32) {
+            cudaDeviceSynchronize();
+            std::vector<float> tmp(ggml_nelements(node), 0.0f);
+            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+                ggml_type_name(node->src[0]->type),
+                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+                node->src[0]->name,
+                node->src[1] ? node->src[1]->name : "none");
+            double sum = 0.0;
+            double sq_sum = 0.0;
+            for (int i = 0; i < ggml_nelements(node); i++) {
+                printf("%f ", tmp[i]);
+                sum += tmp[i];
+                sq_sum += tmp[i]*tmp[i];
+            }
+            printf("\n");
+            printf("sum: %f, ", sum);
+            printf("sq_sum: %f\n", sq_sum);
+        }
+#endif
+    }
+
+    UNUSED(backend);
+}
+
+static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_TANH:
+                    return true;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
+                if (op->op == GGML_OP_MUL_MAT) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                return true;
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                return false;
+            } break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_NORM:
+        case GGML_OP_REPEAT:
+        case GGML_OP_DUP:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_CLAMP:
+        case GGML_OP_CONT:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ALIBI:
+        case GGML_OP_IM2COL:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_ACC:
+        case GGML_OP_CONCAT:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_LEAKY_RELU:
+            return true;
+        default:
+            return false;
+    }
+
+    UNUSED(backend);
+}
+
+static ggml_backend_i cuda_backend_i = {
+    /* .get_name                = */ ggml_backend_cuda_name,
+    /* .free                    = */ ggml_backend_cuda_free,
+    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
+    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
+    /* .cpy_tensor_from_async   = */ NULL,
+    /* .cpy_tensor_to_async     = */ NULL,
+    /* .synchronize             = */ ggml_backend_cuda_synchronize,
+    /* .graph_plan_create       = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute      = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op             = */ ggml_backend_cuda_supports_op,
+};
+
+ggml_backend_t ggml_backend_cuda_init(int device) {
+    ggml_init_cublas(); // TODO: remove from ggml.c
+
+    if (device < 0 || device >= ggml_cuda_get_device_count()) {
+        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        return nullptr;
+    }
+
+    // not strictly necessary, but it may reduce the overhead of the first graph_compute
+    ggml_cuda_set_main_device(device);
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
+        /* .device = */ device
+    };
+
+    ggml_backend_t cuda_backend = new ggml_backend {
+        /* .interface = */ cuda_backend_i,
+        /* .context   = */ ctx
+    };
+
+    return cuda_backend;
+}
+
+bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_cuda_name;
+}
+
+static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
+    return cuda_backend;
+
+    UNUSED(params);
+}
+
+extern "C" int ggml_backend_cuda_reg_devices();
+
+int ggml_backend_cuda_reg_devices() {
+    int device_count = ggml_cuda_get_device_count();
+    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
+    for (int i = 0; i < device_count; i++) {
+        char name[128];
+        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
+    }
+    return device_count;
+}
diff --git a/dpcpp_out2/ggml-cuda.h b/dpcpp_out2/ggml-cuda.h
new file mode 100644
index 0000000000000..cdb0c0c41618a
--- /dev/null
+++ b/dpcpp_out2/ggml-cuda.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef GGML_USE_HIPBLAS
+#define GGML_CUDA_NAME "ROCm"
+#define GGML_CUBLAS_NAME "hipBLAS"
+#else
+#define GGML_CUDA_NAME "CUDA"
+#define GGML_CUBLAS_NAME "cuBLAS"
+#endif
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_CUDA_MAX_DEVICES       16
+
+// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
+GGML_API void   ggml_init_cublas(void);
+
+// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
+GGML_API bool   ggml_cublas_loaded(void);
+
+GGML_API void * ggml_cuda_host_malloc(size_t size);
+GGML_API void   ggml_cuda_host_free(void * ptr);
+
+GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_cuda_set_tensor_split(const float * tensor_split);
+GGML_API void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_free_data(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
+GGML_API void   ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
+GGML_API void   ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_cuda_set_main_device(int main_device);
+GGML_API void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
+GGML_API void   ggml_cuda_set_scratch_size(size_t scratch_size);
+GGML_API void   ggml_cuda_free_scratch(void);
+GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+GGML_API int    ggml_cuda_get_device_count(void);
+GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+
+// backend API
+GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
+
+GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
+GGML_API int  ggml_backend_cuda_get_device(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
+
+// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/dpcpp_out2/ggml.h b/dpcpp_out2/ggml.h
new file mode 100644
index 0000000000000..5bb5323434e01
--- /dev/null
+++ b/dpcpp_out2/ggml.h
@@ -0,0 +1,2253 @@
+#pragma once
+
+//
+// GGML Tensor Library
+//
+// This documentation is still a work in progress.
+// If you wish some specific topics to be covered, feel free to drop a comment:
+//
+//   https://github.com/ggerganov/whisper.cpp/issues/40
+//
+// ## Overview
+//
+// This library implements:
+//
+//  - a set of tensor operations
+//  - automatic differentiation
+//  - basic optimization algorithms
+//
+// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
+// but is not limited to, the following:
+//
+//  - linear regression
+//  - support vector machines
+//  - neural networks
+//
+// The library allows the user to define a certain function using the available tensor operations. This function
+// definition is represented internally via a computation graph. Each tensor operation in the function definition
+// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
+// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
+// using one of the available optimization algorithms.
+//
+// For example, here we define the function: f(x) = a*x^2 + b
+//
+//   {
+//       struct ggml_init_params params = {
+//           .mem_size   = 16*1024*1024,
+//           .mem_buffer = NULL,
+//       };
+//
+//       // memory allocation happens here
+//       struct ggml_context * ctx = ggml_init(params);
+//
+//       struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+//
+//       ggml_set_param(ctx, x); // x is an input variable
+//
+//       struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+//       struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
+//       struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
+//       struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
+//
+//       ...
+//   }
+//
+// Notice that the function definition above does not involve any actual computation. The computation is performed only
+// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
+//
+//   {
+//       ...
+//
+//       struct ggml_cgraph * gf = ggml_new_graph(ctx);
+//       ggml_build_forward_expand(gf, f);
+//
+//       // set the input variable and parameter values
+//       ggml_set_f32(x, 2.0f);
+//       ggml_set_f32(a, 3.0f);
+//       ggml_set_f32(b, 4.0f);
+//
+//       ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
+//
+//       printf("f = %f\n", ggml_get_f32_1d(f, 0));
+//
+//       ...
+//   }
+//
+// The actual computation is performed in the ggml_graph_compute() function.
+//
+// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
+// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
+// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
+// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
+// actually needed.
+//
+// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
+// differentiation and optimization algorithms.
+//
+// The described approach allows to define the function graph once and then compute its forward or backward graphs
+// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
+// the user can avoid the memory allocation overhead at runtime.
+//
+// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
+// citizens, but in theory the library can be extended to support FP8 and integer data types.
+//
+// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
+// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
+// clear that the library needs to support more complex operations. The way to support these operations is not clear
+// yet, but a few examples are demonstrated in the following operations:
+//
+//   - ggml_permute()
+//   - ggml_conv_1d_1s()
+//   - ggml_conv_1d_2s()
+//
+// For each tensor operator, the library implements a forward and backward computation function. The forward function
+// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
+// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
+// calculus class, or watch the following video:
+//
+//   What is Automatic Differentiation?
+//   https://www.youtube.com/watch?v=wG_nF1awSSY
+//
+//
+// ## Tensor data (struct ggml_tensor)
+//
+// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
+// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
+// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
+//
+//   {
+//       struct ggml_tensor * c = ggml_add(ctx, a, b);
+//
+//       assert(c->src[0] == a);
+//       assert(c->src[1] == b);
+//   }
+//
+// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
+// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
+// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
+// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
+// contiguous in memory.
+//
+// The data of the tensor is accessed via the "data" pointer. For example:
+//
+//   {
+//       const int nx = 2;
+//       const int ny = 3;
+//
+//       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
+//
+//       for (int y = 0; y < ny; y++) {
+//           for (int x = 0; x < nx; x++) {
+//               *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
+//           }
+//       }
+//
+//       ...
+//   }
+//
+// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
+//
+// ## The matrix multiplication operator (ggml_mul_mat)
+//
+// TODO
+//
+//
+// ## Multi-threading
+//
+// TODO
+//
+//
+// ## Overview of ggml.c
+//
+// TODO
+//
+//
+// ## SIMD optimizations
+//
+// TODO
+//
+//
+// ## Debugging ggml
+//
+// TODO
+//
+//
+
+#ifdef GGML_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BUILD
+#            define GGML_API __declspec(dllexport)
+#        else
+#            define GGML_API __declspec(dllimport)
+#        endif
+#    else
+#        define GGML_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define GGML_API
+#endif
+
+// TODO: support for clang
+#ifdef __GNUC__
+#    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
+#elif defined(_MSC_VER)
+#    define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
+#else
+#    define GGML_DEPRECATED(func, hint) func
+#endif
+
+#ifndef __GNUC__
+#    define GGML_ATTRIBUTE_FORMAT(...)
+#elif defined(__MINGW32__)
+#    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+
+#define DPCT_PROFILING_ENABLED
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+
+#define GGML_FILE_MAGIC   0x67676d6c // "ggml"
+#define GGML_FILE_VERSION 1
+
+#define GGML_QNT_VERSION        2    // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
+#define GGML_MAX_DIMS           4
+#define GGML_MAX_PARAMS         2048
+#define GGML_MAX_CONTEXTS       64
+#define GGML_MAX_SRC            10
+#define GGML_MAX_NAME           64
+#define GGML_MAX_OP_PARAMS      64
+#define GGML_DEFAULT_N_THREADS  4
+#define GGML_DEFAULT_GRAPH_SIZE 2048
+#if UINTPTR_MAX == 0xFFFFFFFF
+    #define GGML_MEM_ALIGN 4
+#else
+    #define GGML_MEM_ALIGN 16
+#endif
+
+#define GGML_EXIT_SUCCESS 0
+#define GGML_EXIT_ABORTED 1
+
+#define GGUF_MAGIC "GGUF"
+
+#define GGUF_VERSION 3
+
+#define GGUF_DEFAULT_ALIGNMENT 32
+
+#define GGML_UNUSED(x) (void)(x)
+
+#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
+
+#define GGML_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fflush(stdout); \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            ggml_print_backtrace(); \
+            abort(); \
+        } \
+    } while (0)
+
+#ifndef NDEBUG
+#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
+#elif defined(__GNUC__)
+#define GGML_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define GGML_UNREACHABLE() __assume(0)
+#else
+#define GGML_UNREACHABLE() ((void) 0)
+#endif
+
+// used to copy the number of elements and stride in bytes of tensors into local variables.
+// main purpose is to reduce code duplication and improve readability.
+//
+// example:
+//
+//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
+//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
+//
+#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
+    const type prefix##0 = (pointer)->array[0]; \
+    GGML_UNUSED(prefix##0);
+#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
+    const type prefix##1 = (pointer)->array[1]; \
+    GGML_UNUSED(prefix##1);
+#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
+    const type prefix##2 = (pointer)->array[2]; \
+    GGML_UNUSED(prefix##2);
+#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
+    GGML_TENSOR_LOCALS_3  (type, prefix, pointer, array) \
+    const type prefix##3 = (pointer)->array[3]; \
+    GGML_UNUSED(prefix##3);
+
+#define GGML_TENSOR_UNARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
+#define GGML_TENSOR_BINARY_OP_LOCALS \
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
+    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
+    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#if defined(__ARM_NEON) && defined(SYCL_LANGUAGE_VERSION)
+    typedef half ggml_fp16_t;
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
+    typedef __fp16 ggml_fp16_t;
+#else
+    typedef uint16_t ggml_fp16_t;
+#endif
+
+    // convert FP16 <-> FP32
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
+
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
+
+    struct ggml_object;
+    struct ggml_context;
+
+    enum ggml_type {
+        GGML_TYPE_F32  = 0,
+        GGML_TYPE_F16  = 1,
+        GGML_TYPE_Q4_0 = 2,
+        GGML_TYPE_Q4_1 = 3,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
+        // GGML_TYPE_Q4_3 (5) support has been removed
+        GGML_TYPE_Q5_0 = 6,
+        GGML_TYPE_Q5_1 = 7,
+        GGML_TYPE_Q8_0 = 8,
+        GGML_TYPE_Q8_1 = 9,
+        // k-quantizations
+        GGML_TYPE_Q2_K = 10,
+        GGML_TYPE_Q3_K = 11,
+        GGML_TYPE_Q4_K = 12,
+        GGML_TYPE_Q5_K = 13,
+        GGML_TYPE_Q6_K = 14,
+        GGML_TYPE_Q8_K = 15,
+        GGML_TYPE_I8,
+        GGML_TYPE_I16,
+        GGML_TYPE_I32,
+        GGML_TYPE_COUNT,
+    };
+
+    // precision
+    enum ggml_prec {
+        GGML_PREC_DEFAULT,
+        GGML_PREC_F32,
+    };
+
+    enum ggml_backend_type {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_GPU = 10,
+        GGML_BACKEND_GPU_SPLIT = 20,
+    };
+
+    // model file types
+    enum ggml_ftype {
+        GGML_FTYPE_UNKNOWN     = -1,
+        GGML_FTYPE_ALL_F32     = 0,
+        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
+    };
+
+    // available tensor operations:
+    enum ggml_op {
+        GGML_OP_NONE = 0,
+
+        GGML_OP_DUP,
+        GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
+        GGML_OP_SUB,
+        GGML_OP_MUL,
+        GGML_OP_DIV,
+        GGML_OP_SQR,
+        GGML_OP_SQRT,
+        GGML_OP_LOG,
+        GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
+        GGML_OP_MEAN,
+        GGML_OP_ARGMAX,
+        GGML_OP_REPEAT,
+        GGML_OP_REPEAT_BACK,
+        GGML_OP_CONCAT,
+        GGML_OP_SILU_BACK,
+        GGML_OP_NORM, // normalize
+        GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,
+        GGML_OP_GROUP_NORM,
+
+        GGML_OP_MUL_MAT,
+        GGML_OP_MUL_MAT_ID,
+        GGML_OP_OUT_PROD,
+
+        GGML_OP_SCALE,
+        GGML_OP_SET,
+        GGML_OP_CPY,
+        GGML_OP_CONT,
+        GGML_OP_RESHAPE,
+        GGML_OP_VIEW,
+        GGML_OP_PERMUTE,
+        GGML_OP_TRANSPOSE,
+        GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
+        GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
+        GGML_OP_SOFT_MAX,
+        GGML_OP_SOFT_MAX_BACK,
+        GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
+        GGML_OP_ALIBI,
+        GGML_OP_CLAMP,
+        GGML_OP_CONV_TRANSPOSE_1D,
+        GGML_OP_IM2COL,
+        GGML_OP_CONV_TRANSPOSE_2D,
+        GGML_OP_POOL_1D,
+        GGML_OP_POOL_2D,
+        GGML_OP_UPSCALE, // nearest interpolate
+        GGML_OP_PAD,
+        GGML_OP_ARGSORT,
+        GGML_OP_LEAKY_RELU,
+
+        GGML_OP_FLASH_ATTN,
+        GGML_OP_FLASH_FF,
+        GGML_OP_FLASH_ATTN_BACK,
+        GGML_OP_WIN_PART,
+        GGML_OP_WIN_UNPART,
+        GGML_OP_GET_REL_POS,
+        GGML_OP_ADD_REL_POS,
+
+        GGML_OP_UNARY,
+
+        GGML_OP_MAP_UNARY,
+        GGML_OP_MAP_BINARY,
+
+        GGML_OP_MAP_CUSTOM1_F32,
+        GGML_OP_MAP_CUSTOM2_F32,
+        GGML_OP_MAP_CUSTOM3_F32,
+
+        GGML_OP_MAP_CUSTOM1,
+        GGML_OP_MAP_CUSTOM2,
+        GGML_OP_MAP_CUSTOM3,
+
+        GGML_OP_CROSS_ENTROPY_LOSS,
+        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
+
+        GGML_OP_COUNT,
+    };
+
+    enum ggml_unary_op {
+        GGML_UNARY_OP_ABS,
+        GGML_UNARY_OP_SGN,
+        GGML_UNARY_OP_NEG,
+        GGML_UNARY_OP_STEP,
+        GGML_UNARY_OP_TANH,
+        GGML_UNARY_OP_ELU,
+        GGML_UNARY_OP_RELU,
+        GGML_UNARY_OP_GELU,
+        GGML_UNARY_OP_GELU_QUICK,
+        GGML_UNARY_OP_SILU,
+
+        GGML_UNARY_OP_COUNT,
+    };
+
+    enum ggml_object_type {
+        GGML_OBJECT_TENSOR,
+        GGML_OBJECT_GRAPH,
+        GGML_OBJECT_WORK_BUFFER
+    };
+
+    enum ggml_log_level {
+        GGML_LOG_LEVEL_ERROR = 2,
+        GGML_LOG_LEVEL_WARN = 3,
+        GGML_LOG_LEVEL_INFO = 4,
+        GGML_LOG_LEVEL_DEBUG = 5
+    };
+
+    // ggml object
+    struct ggml_object {
+        size_t offs;
+        size_t size;
+
+        struct ggml_object * next;
+
+        enum ggml_object_type type;
+
+        char padding[4];
+    };
+
+    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
+    // n-dimensional tensor
+    struct ggml_tensor {
+        enum ggml_type         type;
+        enum ggml_backend_type backend;
+
+        struct ggml_backend_buffer * buffer;
+
+        int64_t ne[GGML_MAX_DIMS]; // number of elements
+        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
+                                   // nb[0] = ggml_type_size(type)
+                                   // nb[1] = nb[0]   * (ne[0] / ggml_blck_size(type)) + padding
+                                   // nb[i] = nb[i-1] * ne[i-1]
+
+        // compute data
+        enum ggml_op op;
+
+        // op params - allocated as int32_t for alignment
+        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
+
+        bool is_param;
+
+        struct ggml_tensor * grad;
+        struct ggml_tensor * src[GGML_MAX_SRC];
+
+        // performance
+        int     perf_runs;
+        int64_t perf_cycles;
+        int64_t perf_time_us;
+
+        struct ggml_tensor * view_src;
+        size_t               view_offs;
+
+        void * data;
+
+        char name[GGML_MAX_NAME];
+
+        void * extra; // extra things e.g. for ggml-cuda.cu
+
+        char padding[8];
+    };
+
+    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
+
+    // the compute plan that needs to be prepared for ggml_graph_compute()
+    // since https://github.com/ggerganov/ggml/issues/287
+    struct ggml_cplan {
+        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
+        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
+
+        int n_threads;
+
+        // abort ggml_graph_compute when true
+        bool (*abort_callback)(void * data);
+        void * abort_callback_data;
+    };
+
+    enum ggml_cgraph_eval_order {
+        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+        GGML_CGRAPH_EVAL_ORDER_COUNT
+    };
+
+    struct ggml_hash_set {
+        size_t size;
+        struct ggml_tensor ** keys;
+    };
+
+    // computation graph
+    struct ggml_cgraph {
+        int size;
+        int n_nodes;
+        int n_leafs;
+
+        struct ggml_tensor ** nodes;
+        struct ggml_tensor ** grads;
+        struct ggml_tensor ** leafs;
+
+        struct ggml_hash_set visited_hash_table;
+
+        enum ggml_cgraph_eval_order order;
+
+        // performance
+        int     perf_runs;
+        int64_t perf_cycles;
+        int64_t perf_time_us;
+    };
+
+    // scratch buffer
+    struct ggml_scratch {
+        size_t offs;
+        size_t size;
+        void * data;
+    };
+
+    struct ggml_init_params {
+        // memory pool
+        size_t mem_size;   // bytes
+        void * mem_buffer; // if NULL, memory will be allocated internally
+        bool   no_alloc;   // don't allocate memory for the tensor data
+    };
+
+
+    // compute types
+
+    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
+    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
+    enum ggml_task_type {
+        GGML_TASK_INIT = 0,
+        GGML_TASK_COMPUTE,
+        GGML_TASK_FINALIZE,
+    };
+
+    struct ggml_compute_params {
+        enum ggml_task_type type;
+
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+    };
+
+    // misc
+
+    GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
+    GGML_API int64_t ggml_time_ms(void);
+    GGML_API int64_t ggml_time_us(void);
+    GGML_API int64_t ggml_cycles(void);
+    GGML_API int64_t ggml_cycles_per_ms(void);
+
+    GGML_API void    ggml_print_backtrace(void);
+
+    GGML_API void    ggml_numa_init(void); // call once for better performance on NUMA systems
+    GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
+
+    GGML_API void    ggml_print_object (const struct ggml_object * obj);
+    GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
+
+    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
+    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
+
+    GGML_API int    ggml_blck_size(enum ggml_type type);
+    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+    GGML_DEPRECATED(
+    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+    "use ggml_row_size() instead");
+
+    GGML_API const char * ggml_type_name(enum ggml_type type);
+    GGML_API const char * ggml_op_name  (enum ggml_op   op);
+    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
+
+    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
+    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
+
+    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
+
+    GGML_API bool    ggml_is_quantized(enum ggml_type type);
+
+    // TODO: temporary until model loading of ggml examples is refactored
+    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
+    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
+
+    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
+
+    // use this to compute the memory overhead of a tensor
+    GGML_API size_t ggml_tensor_overhead(void);
+
+    // main
+
+    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
+    GGML_API void                  ggml_free(struct ggml_context * ctx);
+
+    GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
+
+    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API bool    ggml_get_no_alloc(struct ggml_context * ctx);
+    GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+
+    GGML_API void *  ggml_get_mem_buffer     (const struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_mem_size       (const struct ggml_context * ctx);
+    GGML_API size_t  ggml_get_max_tensor_size(const struct ggml_context * ctx);
+
+    GGML_API struct ggml_tensor * ggml_new_tensor(
+            struct ggml_context * ctx,
+            enum   ggml_type type,
+            int    n_dims,
+            const int64_t *ne);
+
+    GGML_API struct ggml_tensor * ggml_new_tensor_1d(
+            struct ggml_context * ctx,
+            enum   ggml_type type,
+            int64_t ne0);
+
+    GGML_API struct ggml_tensor * ggml_new_tensor_2d(
+            struct ggml_context * ctx,
+            enum   ggml_type type,
+            int64_t ne0,
+            int64_t ne1);
+
+    GGML_API struct ggml_tensor * ggml_new_tensor_3d(
+            struct ggml_context * ctx,
+            enum   ggml_type type,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2);
+
+    GGML_API struct ggml_tensor * ggml_new_tensor_4d(
+            struct ggml_context * ctx,
+            enum   ggml_type type,
+            int64_t ne0,
+            int64_t ne1,
+            int64_t ne2,
+            int64_t ne3);
+
+    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+
+    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
+    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
+
+    // Context tensor enumeration and lookup
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
+    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+
+    // Converts a flat index into coordinates
+    GGML_API void    ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
+
+    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+
+    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
+
+    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+
+    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
+    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
+
+    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
+    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
+
+    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
+
+    GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
+    GGML_ATTRIBUTE_FORMAT(2, 3)
+    GGML_API struct ggml_tensor * ggml_format_name(      struct ggml_tensor * tensor, const char * fmt, ...);
+
+    //
+    // operations on tensors with backpropagation
+    //
+
+    GGML_API struct ggml_tensor * ggml_dup(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_dup_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_add(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_add_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_add_cast(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            enum   ggml_type      type);
+
+    GGML_API struct ggml_tensor * ggml_add1(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_add1_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // dst = a
+    // view(dst, nb1, nb2, nb3, offset) += b
+    // return dst
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_sub(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_sub_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_mul(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_mul_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_div(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_div_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_sqr(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sqr_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sqrt(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sqrt_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // return scalar
+    GGML_API struct ggml_tensor * ggml_sum(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // mean along rows
+    GGML_API struct ggml_tensor * ggml_mean(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // argmax along rows
+    GGML_API struct ggml_tensor * ggml_argmax(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // if a is the same shape as b, and a is not parameter, return a
+    // otherwise, return a new tensor: repeat(a) to fit in b
+    GGML_API struct ggml_tensor * ggml_repeat(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // sums repetitions in a into shape of b
+    GGML_API struct ggml_tensor * ggml_repeat_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // concat a and b on dim 2
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_concat(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_abs(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_abs_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sgn(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_sgn_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_neg(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_neg_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_step(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_step_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_tanh(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_tanh_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_elu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_relu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_leaky_relu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a, float negative_slope, bool inplace);
+
+    GGML_API struct ggml_tensor * ggml_relu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_quick(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_silu(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_silu_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // normalize along rows
+    GGML_API struct ggml_tensor * ggml_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 eps);
+
+    GGML_API struct ggml_tensor * ggml_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 eps);
+
+    GGML_API struct ggml_tensor * ggml_rms_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 eps);
+
+    GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 eps);
+
+    // group normalize along ne0*ne1*n_groups
+    // used in stable-diffusion
+    // TODO: eps is hardcoded to 1e-6 for now
+    GGML_API struct ggml_tensor * ggml_group_norm(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
+    GGML_API struct ggml_tensor * ggml_group_norm_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_groups);
+
+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            float                 eps);
+
+    // A: k columns, n rows => [ne03, ne02, n, k]
+    // B: k columns, m rows  (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
+    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
+    GGML_API struct ggml_tensor * ggml_mul_mat(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // change the precision of a matrix multiplication
+    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
+    GGML_API void ggml_mul_mat_set_prec(
+            struct ggml_tensor * a,
+            enum ggml_prec       prec);
+
+    // indirect matrix multiplication
+    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
+    GGML_API struct ggml_tensor * ggml_mul_mat_id(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * const as[],
+            int                   n_as,
+            struct ggml_tensor  * ids,
+            int                   id,
+            struct ggml_tensor  * b);
+
+    // A: m columns, n rows,
+    // B: p columns, n rows,
+    // result is m columns, p rows
+    GGML_API struct ggml_tensor * ggml_out_prod(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    //
+    // operations on tensors without backpropagation
+    //
+
+    GGML_API struct ggml_tensor * ggml_scale(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 s);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+    // a -> b, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // a -> b, in-place, return view(b)
+    GGML_API struct ggml_tensor * ggml_cpy_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // make contiguous
+    GGML_API struct ggml_tensor * ggml_cont(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // make contiguous, in-place
+    GGML_API struct ggml_tensor * ggml_cont_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // make contiguous, with new shape
+    GGML_API struct ggml_tensor * ggml_cont_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0);
+
+    GGML_API struct ggml_tensor * ggml_cont_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1);
+
+    GGML_API struct ggml_tensor * ggml_cont_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2);
+
+    GGML_API struct ggml_tensor * ggml_cont_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3);
+
+    // return view(a), b specifies the new shape
+    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // return view(a)
+    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0);
+
+    GGML_API struct ggml_tensor * ggml_reshape_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1);
+
+    // return view(a)
+    // TODO: when we start computing gradient, make a copy instead of view
+    GGML_API struct ggml_tensor * ggml_reshape_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2);
+
+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3);
+
+    // offset in bytes
+    GGML_API struct ggml_tensor * ggml_view_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_view_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            size_t                nb1, // row stride in bytes
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_view_3d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            size_t                nb1, // row   stride in bytes
+            size_t                nb2, // slice stride in bytes
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            size_t                nb1, // row   stride in bytes
+            size_t                nb2, // slice stride in bytes
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_permute(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   axis0,
+            int                   axis1,
+            int                   axis2,
+            int                   axis3);
+
+    // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
+    GGML_API struct ggml_tensor * ggml_transpose(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // supports 3D: a->ne[2] == b->ne[1]
+    GGML_API struct ggml_tensor * ggml_get_rows(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+        struct ggml_context     * ctx,
+        struct ggml_tensor      * a);
+
+    // set elements above the diagonal to -INF
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    GGML_API struct ggml_tensor * ggml_soft_max(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // fused soft_max(a*scale + mask)
+    // mask is optional
+    GGML_API struct ggml_tensor * ggml_soft_max_ext(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * mask,
+            float                 scale);
+
+    GGML_API struct ggml_tensor * ggml_soft_max_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // rotary position embedding
+    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
+    // if mode & 2 == 1, GPT-NeoX style
+    // if mode & 4 == 1, ChatGLM style
+    //
+    // b is an int32 vector with size a->ne[2], it contains the positions
+    GGML_API struct ggml_tensor * ggml_rope(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx);
+
+    // custom RoPE
+    GGML_API struct ggml_tensor * ggml_rope_custom(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            int                   n_orig_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            int                   n_orig_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow);
+
+    // compute correction dims for YaRN RoPE scaling
+    void ggml_rope_yarn_corr_dims(
+        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
+
+    // xPos RoPE, in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            float                 base,
+            bool                  down);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   n_dims,
+            int                   mode,
+            int                   n_ctx,
+            int                   n_orig_ctx,
+            float                 freq_base,
+            float                 freq_scale,
+            float                 ext_factor,
+            float                 attn_factor,
+            float                 beta_fast,
+            float                 beta_slow,
+            float                 xpos_base,
+            bool                  xpos_down);
+
+    // alibi position embedding
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_alibi(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_head,
+            float                 bias_max);
+
+    // clamp
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_clamp(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            float                 min,
+            float                 max);
+
+    GGML_API struct ggml_tensor * ggml_im2col(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                  s0,
+            int                  s1,
+            int                  p0,
+            int                  p1,
+            int                  d0,
+            int                  d1,
+            bool                 is_2D);
+
+    GGML_API struct ggml_tensor * ggml_conv_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s0,  // stride
+            int                   p0,  // padding
+            int                   d0); // dilation
+
+    // conv_1d with padding = half
+    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
+    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s,
+            int                   d);
+
+    GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s0,
+            int                   p0,
+            int                   d0);
+
+    GGML_API struct ggml_tensor * ggml_conv_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   s0,
+            int                   s1,
+            int                   p0,
+            int                   p1,
+            int                   d0,
+            int                   d1);
+
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is equal to kernel size
+    // padding is zero
+    // example:
+    // a:     16   16    3  768
+    // b:   1024 1024    3    1
+    // res:   64   64  768    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // kernel size is a->ne[0] x a->ne[1]
+    // stride is 1
+    // padding is half
+    // example:
+    // a:      3    3    256  256
+    // b:     64   64    256    1
+    // res:   64   64    256    1
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            int                   stride);
+
+    enum ggml_op_pool {
+        GGML_OP_POOL_MAX,
+        GGML_OP_POOL_AVG,
+        GGML_OP_POOL_COUNT,
+    };
+
+    GGML_API struct ggml_tensor * ggml_pool_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0, // kernel size
+            int                   s0, // stride
+            int                   p0); // padding
+
+    // the result will have 2*p0 padding for the first dimension
+    // and 2*p1 padding for the second dimension
+    GGML_API struct ggml_tensor * ggml_pool_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_op_pool     op,
+            int                   k0,
+            int                   k1,
+            int                   s0,
+            int                   s1,
+            float                 p0,
+            float                 p1);
+
+    // nearest interpolate
+    // used in stable-diffusion
+    GGML_API struct ggml_tensor * ggml_upscale(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   scale_factor);
+
+    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
+    GGML_API struct ggml_tensor * ggml_pad(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                  p0,
+            int                  p1,
+            int                  p2,
+            int                  p3);
+
+    // sort rows
+    enum ggml_sort_order {
+        GGML_SORT_ASC,
+        GGML_SORT_DESC,
+    };
+
+    GGML_API struct ggml_tensor * ggml_argsort(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            enum ggml_sort_order  order);
+
+    // top k elements per row
+    GGML_API struct ggml_tensor * ggml_top_k(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   k);
+
+    GGML_API struct ggml_tensor * ggml_flash_attn(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * q,
+            struct ggml_tensor  * k,
+            struct ggml_tensor  * v,
+            bool                  masked);
+
+    GGML_API struct ggml_tensor * ggml_flash_attn_back(
+           struct ggml_context * ctx,
+           struct ggml_tensor  * q,
+           struct ggml_tensor  * k,
+           struct ggml_tensor  * v,
+           struct ggml_tensor  * d,
+           bool                  masked);
+
+    GGML_API struct ggml_tensor * ggml_flash_ff(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b0,
+            struct ggml_tensor  * b1,
+            struct ggml_tensor  * c0,
+            struct ggml_tensor  * c1);
+
+    // partition into non-overlapping windows with padding if needed
+    // example:
+    // a:   768   64   64    1
+    // w:    14
+    // res: 768   14   14    25
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_win_part(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   w);
+
+    // reverse of ggml_win_part
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_win_unpart(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   w0,
+            int                   h0,
+            int                   w);
+
+    GGML_API struct ggml_tensor * ggml_unary(
+            struct ggml_context * ctx,
+             struct ggml_tensor * a,
+             enum ggml_unary_op op);
+
+    GGML_API struct ggml_tensor * ggml_unary_inplace(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        enum ggml_unary_op op);
+
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_get_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   qh,
+            int                   kh);
+
+    // used in sam
+    GGML_API struct ggml_tensor * ggml_add_rel_pos(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
+    GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * pw,
+            struct ggml_tensor  * ph);
+
+    // custom operators
+
+    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+
+    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
+            struct ggml_context        * ctx,
+            struct ggml_tensor         * a,
+                   ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1 instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
+            struct ggml_context        * ctx,
+            struct ggml_tensor         * a,
+                   ggml_unary_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
+            struct ggml_context         * ctx,
+            struct ggml_tensor          * a,
+            struct ggml_tensor          * b,
+                   ggml_binary_op_f32_t   fun),
+        "use ggml_map_custom2 instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
+            struct ggml_context         * ctx,
+            struct ggml_tensor          * a,
+            struct ggml_tensor          * b,
+                   ggml_binary_op_f32_t   fun),
+        "use ggml_map_custom2_inplace instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+                   ggml_custom1_op_f32_t   fun),
+        "use ggml_map_custom1 instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+                   ggml_custom1_op_f32_t   fun),
+        "use ggml_map_custom1_inplace instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+            struct ggml_tensor           * b,
+                   ggml_custom2_op_f32_t   fun),
+        "use ggml_map_custom2 instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+            struct ggml_tensor           * b,
+                   ggml_custom2_op_f32_t   fun),
+        "use ggml_map_custom2_inplace instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+            struct ggml_tensor           * b,
+            struct ggml_tensor           * c,
+                   ggml_custom3_op_f32_t   fun),
+        "use ggml_map_custom3 instead");
+
+    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
+            struct ggml_context          * ctx,
+            struct ggml_tensor           * a,
+            struct ggml_tensor           * b,
+            struct ggml_tensor           * c,
+                   ggml_custom3_op_f32_t   fun),
+        "use ggml_map_custom3_inplace instead");
+
+    // custom operators v2
+
+    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
+    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
+
+    #define GGML_N_TASKS_MAX -1
+
+    GGML_API struct ggml_tensor * ggml_map_custom1(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            ggml_custom1_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            ggml_custom1_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            ggml_custom2_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            ggml_custom2_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            struct ggml_tensor    * c,
+            ggml_custom3_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
+            struct ggml_context   * ctx,
+            struct ggml_tensor    * a,
+            struct ggml_tensor    * b,
+            struct ggml_tensor    * c,
+            ggml_custom3_op_t       fun,
+            int                     n_tasks,
+            void                  * userdata);
+
+    // loss function
+
+    GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
+            struct ggml_context         * ctx,
+            struct ggml_tensor          * a,
+            struct ggml_tensor          * b);
+
+    GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
+            struct ggml_context         * ctx,
+            struct ggml_tensor          * a,
+            struct ggml_tensor          * b,
+            struct ggml_tensor          * c);
+
+    //
+    // automatic differentiation
+    //
+
+    GGML_API void ggml_set_param(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * tensor);
+
+
+    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
+
+    // graph allocation in a context
+    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
+    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
+    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
+    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
+    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);
+
+    GGML_API size_t ggml_graph_overhead(void);
+    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
+
+    // ggml_graph_plan() has to be called before ggml_graph_compute()
+    // when plan.work_size > 0, caller must allocate memory for plan.work_data
+    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
+    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
+
+    // same as ggml_graph_compute() but the work data is allocated as a part of the context
+    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
+    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
+
+    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
+    GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
+
+    // print info and performance information for the graph
+    GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
+
+    // dump the graph into a file using the dot format
+    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
+
+    // build gradient checkpointing backward graph gb for gf using provided checkpoints
+    // gb_tmp will contain original backward graph with rewritten backward process nodes,
+    // but without the second forward pass nodes.
+    GGML_API void ggml_build_backward_gradient_checkpointing(
+            struct ggml_context   * ctx,
+            struct ggml_cgraph    * gf,
+            struct ggml_cgraph    * gb,
+            struct ggml_cgraph    * gb_tmp,
+            struct ggml_tensor  * * checkpoints,
+            int                     n_checkpoints);
+    //
+    // optimization
+    //
+
+    // optimization methods
+    enum ggml_opt_type {
+        GGML_OPT_ADAM,
+        GGML_OPT_LBFGS,
+    };
+
+    // linesearch methods
+    enum ggml_linesearch {
+        GGML_LINESEARCH_DEFAULT = 1,
+
+        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
+        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
+        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
+    };
+
+    // optimization return values
+    enum ggml_opt_result {
+        GGML_OPT_OK = 0,
+        GGML_OPT_DID_NOT_CONVERGE,
+        GGML_OPT_NO_CONTEXT,
+        GGML_OPT_INVALID_WOLFE,
+        GGML_OPT_FAIL,
+        GGML_OPT_CANCEL,
+
+        GGML_LINESEARCH_FAIL = -128,
+        GGML_LINESEARCH_MINIMUM_STEP,
+        GGML_LINESEARCH_MAXIMUM_STEP,
+        GGML_LINESEARCH_MAXIMUM_ITERATIONS,
+        GGML_LINESEARCH_INVALID_PARAMETERS,
+    };
+
+    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
+    typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
+
+    // optimization parameters
+    //
+    //   see ggml.c (ggml_opt_default_params) for default values
+    //
+    struct ggml_opt_params {
+        enum ggml_opt_type type;
+
+        size_t graph_size;
+
+        int n_threads;
+
+        // delta-based convergence test
+        //
+        //   if past == 0 - disabled
+        //   if past > 0:
+        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
+        //
+        int past;
+        float delta;
+
+        // maximum number of iterations without improvement
+        //
+        //   if 0 - disabled
+        //   if > 0:
+        //     assume convergence if no cost improvement in this number of iterations
+        //
+        int max_no_improvement;
+
+        bool print_forward_graph;
+        bool print_backward_graph;
+
+        int n_gradient_accumulation;
+
+        // ADAM parameters
+        struct {
+            int n_iter;
+
+            float sched; // schedule multiplier (fixed, decay or warmup)
+            float decay; // weight decay for AdamW, use 0.0f to disable
+            int   decay_min_ndim; // minimum number of tensor dimension to apply weight decay
+            float alpha; // learning rate
+            float beta1;
+            float beta2;
+            float eps;   // epsilon for numerical stability
+            float eps_f; // epsilon for convergence test
+            float eps_g; // epsilon for convergence test
+            float gclip; // gradient clipping
+        } adam;
+
+        // LBFGS parameters
+        struct {
+            int m; // number of corrections to approximate the inv. Hessian
+            int n_iter;
+            int max_linesearch;
+
+            float eps;      // convergence tolerance
+            float ftol;     // line search tolerance
+            float wolfe;
+            float min_step;
+            float max_step;
+
+            enum ggml_linesearch linesearch;
+        } lbfgs;
+    };
+
+    struct ggml_opt_context {
+        struct ggml_context * ctx;
+        struct ggml_opt_params params;
+
+        int iter;
+        int64_t nx; // number of parameter elements
+
+        bool just_initialized;
+
+        float loss_before;
+        float loss_after;
+
+        struct {
+            struct ggml_tensor * g;  // current gradient
+            struct ggml_tensor * m;  // first moment
+            struct ggml_tensor * v;  // second moment
+            struct ggml_tensor * pf; // past function values
+            float fx_best;
+            float fx_prev;
+            int n_no_improvement;
+        } adam;
+
+        struct {
+            struct ggml_tensor * x;    // current parameters
+            struct ggml_tensor * xp;   // previous parameters
+            struct ggml_tensor * g;    // current gradient
+            struct ggml_tensor * gp;   // previous gradient
+            struct ggml_tensor * d;    // search direction
+            struct ggml_tensor * pf;   // past function values
+            struct ggml_tensor * lmal; // the L-BFGS memory alpha
+            struct ggml_tensor * lmys; // the L-BFGS memory ys
+            struct ggml_tensor * lms;  // the L-BFGS memory s
+            struct ggml_tensor * lmy;  // the L-BFGS memory y
+            float fx_best;
+            float step;
+            int j;
+            int k;
+            int end;
+            int n_no_improvement;
+        } lbfgs;
+    };
+
+    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+
+    // optimize the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt(
+            struct ggml_context * ctx,
+            struct ggml_opt_params params,
+            struct ggml_tensor * f);
+
+    // initialize optimizer context
+    GGML_API void ggml_opt_init(
+            struct ggml_context     * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_opt_params    params,
+            int64_t                   nx);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f);
+
+    // continue optimizing the function defined by the tensor f
+    GGML_API enum ggml_opt_result ggml_opt_resume_g(
+            struct ggml_context * ctx,
+            struct ggml_opt_context * opt,
+            struct ggml_tensor * f,
+            struct ggml_cgraph * gf,
+            struct ggml_cgraph * gb,
+            ggml_opt_callback callback,
+            void * callback_data);
+
+    //
+    // quantization
+    //
+
+    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
+    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
+
+    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
+
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
+
+    //
+    // gguf
+    //
+
+    enum gguf_type {
+        GGUF_TYPE_UINT8   = 0,
+        GGUF_TYPE_INT8    = 1,
+        GGUF_TYPE_UINT16  = 2,
+        GGUF_TYPE_INT16   = 3,
+        GGUF_TYPE_UINT32  = 4,
+        GGUF_TYPE_INT32   = 5,
+        GGUF_TYPE_FLOAT32 = 6,
+        GGUF_TYPE_BOOL    = 7,
+        GGUF_TYPE_STRING  = 8,
+        GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_UINT64  = 10,
+        GGUF_TYPE_INT64   = 11,
+        GGUF_TYPE_FLOAT64 = 12,
+        GGUF_TYPE_COUNT,       // marks the end of the enum
+    };
+
+    struct gguf_context;
+
+    struct gguf_init_params {
+        bool no_alloc;
+
+        // if not NULL, create a ggml_context and allocate the tensor data in it
+        struct ggml_context ** ctx;
+    };
+
+    GGML_API struct gguf_context * gguf_init_empty(void);
+    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
+    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
+
+    GGML_API void gguf_free(struct gguf_context * ctx);
+
+    GGML_API const char * gguf_type_name(enum gguf_type type);
+
+    GGML_API int    gguf_get_version    (const struct gguf_context * ctx);
+    GGML_API size_t gguf_get_alignment  (const struct gguf_context * ctx);
+    GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
+    GGML_API void * gguf_get_data       (const struct gguf_context * ctx);
+
+    GGML_API int          gguf_get_n_kv(const struct gguf_context * ctx);
+    GGML_API int          gguf_find_key(const struct gguf_context * ctx, const char * key);
+    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
+
+    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
+    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
+
+    // will abort if the wrong type is used for the key
+    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int key_id);
+    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int key_id);
+    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
+    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
+    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
+    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
+    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
+    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
+    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
+    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
+    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
+    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
+    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
+    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
+    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
+    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
+
+    GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
+    GGML_API int            gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
+    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
+    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
+    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);
+
+    // overrides existing values or adds a new one
+    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
+    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
+    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
+    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
+    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
+    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
+    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
+    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
+    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t  val);
+    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double   val);
+    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
+    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
+    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
+    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
+
+    // set or add KV pairs from another context
+    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
+
+    // manage tensor info
+    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
+    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
+    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
+
+    // writing gguf files can be done in 2 ways:
+    //
+    // - write the entire gguf_context to a binary file in a single pass:
+    //
+    //   gguf_write_to_file(ctx, fname);
+    //
+    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
+    //
+    //   FILE * f = fopen(fname, "wb");
+    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
+    //   fwrite(f, ...);
+    //   void * data = gguf_meta_get_meta_data(ctx);
+    //   fseek(f, 0, SEEK_SET);
+    //   fwrite(f, data, gguf_get_meta_size(ctx));
+    //   free(data);
+    //   fclose(f);
+    //
+
+    // write the entire context to a binary file
+    GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
+
+    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
+    GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
+    GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);
+
+    //
+    // system info
+    //
+
+    GGML_API int ggml_cpu_has_avx        (void);
+    GGML_API int ggml_cpu_has_avx2       (void);
+    GGML_API int ggml_cpu_has_avx512     (void);
+    GGML_API int ggml_cpu_has_avx512_vbmi(void);
+    GGML_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_API int ggml_cpu_has_fma        (void);
+    GGML_API int ggml_cpu_has_neon       (void);
+    GGML_API int ggml_cpu_has_arm_fma    (void);
+    GGML_API int ggml_cpu_has_metal      (void);
+    GGML_API int ggml_cpu_has_f16c       (void);
+    GGML_API int ggml_cpu_has_fp16_va    (void);
+    GGML_API int ggml_cpu_has_wasm_simd  (void);
+    GGML_API int ggml_cpu_has_blas       (void);
+    GGML_API int ggml_cpu_has_cublas     (void);
+    GGML_API int ggml_cpu_has_clblast    (void);
+    GGML_API int ggml_cpu_has_gpublas    (void);
+    GGML_API int ggml_cpu_has_sse3       (void);
+    GGML_API int ggml_cpu_has_ssse3      (void);
+    GGML_API int ggml_cpu_has_vsx        (void);
+
+    //
+    // Internal types and functions exposed for tests and benchmarks
+    //
+
+#ifdef  __cplusplus
+// restrict not standard in C++
+#define GGML_RESTRICT
+#else
+#define GGML_RESTRICT restrict
+#endif
+    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
+    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
+    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
+
+    typedef struct dpct_type_994041 {
+        const char      * type_name;
+        int               blck_size;
+        size_t            type_size;
+        bool              is_quantized;
+        ggml_to_float_t   to_float;
+        ggml_from_float_t from_float;
+        ggml_from_float_t from_float_reference;
+        ggml_vec_dot_t    vec_dot;
+        enum ggml_type    vec_dot_type;
+    } ggml_type_traits_t;
+
+    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/dpcpp_out2/ggml.h.yaml b/dpcpp_out2/ggml.h.yaml
new file mode 100644
index 0000000000000..47d52a213f243
--- /dev/null
+++ b/dpcpp_out2/ggml.h.yaml
@@ -0,0 +1,100 @@
+---
+MainSourceFile:  '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/dpcpp_out2/ggml.h'
+Replacements:
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h'
+    Offset:          7458
+    Length:          0
+    ReplacementText: "#define DPCT_PROFILING_ENABLED\n#include <sycl/sycl.hpp>\n#include <dpct/dpct.hpp>\n"
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h'
+    Offset:          10556
+    Length:          10
+    ReplacementText: SYCL_LANGUAGE_VERSION
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h'
+    Offset:          82284
+    Length:          0
+    ReplacementText: ' dpct_type_994041'
+    ConstantFlag:    ''
+    ConstantOffset:  0
+    InitStr:         ''
+    NewHostVarName:  ''
+    BlockLevelFormatFlag: false
+MainSourceFilesDigest:
+  - MainSourceFile:  '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h'
+    Digest:          a7f88ed7f3bbff01c9713ad58f5dac5b
+DpctVersion:     18.0.0
+MainHelperFileName: ''
+USMLevel:        ''
+FeatureMap:      {}
+CompileTargets:  {}
+OptionMap:
+  AnalysisScopePath:
+    Value:           '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub'
+    Specified:       false
+  AsyncHandler:
+    Value:           'false'
+    Specified:       false
+  CommentsEnabled:
+    Value:           'false'
+    Specified:       false
+  CompilationsDir:
+    Value:           ''
+    Specified:       false
+  CtadEnabled:
+    Value:           'false'
+    Specified:       false
+  EnablepProfiling:
+    Value:           'true'
+    Specified:       true
+  ExperimentalFlag:
+    Value:           '0'
+    Specified:       false
+  ExplicitClNamespace:
+    Value:           'false'
+    Specified:       false
+  ExplicitNamespace:
+    Value:           '20'
+    Specified:       false
+  ExtensionDDFlag:
+    Value:           '0'
+    Specified:       false
+  ExtensionDEFlag:
+    Value:           '4294967295'
+    Specified:       false
+  HelperFuncPreferenceFlag:
+    Value:           '0'
+    Specified:       false
+  NDRangeDim:
+    Value:           '3'
+    Specified:       false
+  NoDRYPattern:
+    Value:           'false'
+    Specified:       false
+  NoUseGenericSpace:
+    Value:           ''
+    Specified:       true
+  OptimizeMigration:
+    Value:           'false'
+    Specified:       false
+  ProcessAll:
+    Value:           'false'
+    Specified:       false
+  RuleFile:
+    Value:           ''
+    Specified:       false
+  SyclNamedLambda:
+    Value:           'false'
+    Specified:       false
+  UsmLevel:
+    Value:           '1'
+    Specified:       false
+...
diff --git a/dpct/atomic.hpp b/dpct/atomic.hpp
new file mode 100644
index 0000000000000..4b516f5304023
--- /dev/null
+++ b/dpct/atomic.hpp
@@ -0,0 +1,842 @@
+//==---- atomic.hpp -------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_ATOMIC_HPP__
+#define __DPCT_ATOMIC_HPP__
+
+#include <sycl/sycl.hpp>
+
+namespace dpct {
+
+/// Atomically add the value operand to the value at the addr and assign the
+/// result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to add to the value at \p addr.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T,
+          sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+inline T atomic_fetch_add(T *addr, T operand) {
+  auto atm =
+      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_add(operand);
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device,
+          typename T1, typename T2>
+inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
+  auto atm =
+      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_add(operand);
+}
+
+/// Atomically add the value operand to the value at the addr and assign the
+/// result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to add to the value at \p addr.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace =
+                          sycl::access::address_space::global_space>
+inline T atomic_fetch_add(T *addr, T operand,
+                          sycl::memory_order memoryOrder) {
+  switch (memoryOrder) {
+  case sycl::memory_order::relaxed:
+    return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::acq_rel:
+    return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::seq_cst:
+    return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
+                            sycl::memory_scope::device>(addr, operand);
+  default:
+    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                    "atomics are: sycl::memory_order::relaxed, "
+                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+  }
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          typename T1, typename T2>
+inline T1 atomic_fetch_add(T1 *addr, T2 operand,
+                           sycl::memory_order memoryOrder) {
+  atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
+}
+
+/// Atomically subtract the value operand from the value at the addr and assign
+/// the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to subtract from the value at \p addr
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T,
+          sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+inline T atomic_fetch_sub(T *addr, T operand) {
+  auto atm =
+      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_sub(operand);
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device,
+          typename T1, typename T2>
+inline T1 atomic_fetch_sub(T1 *addr, T2 operand) {
+  auto atm =
+      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_sub(operand);
+}
+
+/// Atomically subtract the value operand from the value at the addr and assign
+/// the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to subtract from the value at \p addr
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace =
+                          sycl::access::address_space::global_space>
+inline T atomic_fetch_sub(T *addr, T operand,
+                          sycl::memory_order memoryOrder) {
+  switch (memoryOrder) {
+  case sycl::memory_order::relaxed:
+    return atomic_fetch_sub<T, addressSpace, sycl::memory_order::relaxed,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::acq_rel:
+    return atomic_fetch_sub<T, addressSpace, sycl::memory_order::acq_rel,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::seq_cst:
+    return atomic_fetch_sub<T, addressSpace, sycl::memory_order::seq_cst,
+                            sycl::memory_scope::device>(addr, operand);
+  default:
+    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                    "atomics are: sycl::memory_order::relaxed, "
+                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+  }
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          typename T1, typename T2>
+inline T1 atomic_fetch_sub(T1 *addr, T2 operand,
+                           sycl::memory_order memoryOrder) {
+  atomic_fetch_sub<T1, addressSpace>(addr, operand, memoryOrder);
+}
+
+/// Atomically perform a bitwise AND between the value operand and the value at the addr
+/// and assign the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to use in bitwise AND operation with the value at the \p addr.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T,
+          sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+inline T atomic_fetch_and(T *addr, T operand) {
+  auto atm =
+      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_and(operand);
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device,
+          typename T1, typename T2>
+inline T1 atomic_fetch_and(T1 *addr, T2 operand) {
+  auto atm =
+      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_and(operand);
+}
+
+/// Atomically perform a bitwise AND between the value operand and the value at the addr
+/// and assign the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to use in bitwise AND operation with the value at the \p addr.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace =
+                          sycl::access::address_space::global_space>
+inline T atomic_fetch_and(T *addr, T operand,
+                          sycl::memory_order memoryOrder) {
+  switch (memoryOrder) {
+  case sycl::memory_order::relaxed:
+    return atomic_fetch_and<T, addressSpace, sycl::memory_order::relaxed,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::acq_rel:
+    return atomic_fetch_and<T, addressSpace, sycl::memory_order::acq_rel,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::seq_cst:
+    return atomic_fetch_and<T, addressSpace, sycl::memory_order::seq_cst,
+                            sycl::memory_scope::device>(addr, operand);
+  default:
+    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                    "atomics are: sycl::memory_order::relaxed, "
+                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+  }
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          typename T1, typename T2>
+inline T1 atomic_fetch_and(T1 *addr, T2 operand,
+                           sycl::memory_order memoryOrder) {
+  atomic_fetch_and<T1, addressSpace>(addr, operand, memoryOrder);
+}
+
+/// Atomically or the value at the addr with the value operand, and assign
+/// the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to use in bitwise OR operation with the value at the \p addr.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T,
+          sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+inline T atomic_fetch_or(T *addr, T operand) {
+  auto atm =
+      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_or(operand);
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device,
+          typename T1, typename T2>
+inline T1 atomic_fetch_or(T1 *addr, T2 operand) {
+  auto atm =
+      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_or(operand);
+}
+
+/// Atomically or the value at the addr with the value operand, and assign
+/// the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to use in bitwise OR operation with the value at the \p addr.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace =
+                          sycl::access::address_space::global_space>
+inline T atomic_fetch_or(T *addr, T operand,
+                         sycl::memory_order memoryOrder) {
+  switch (memoryOrder) {
+  case sycl::memory_order::relaxed:
+    return atomic_fetch_or<T, addressSpace, sycl::memory_order::relaxed,
+                           sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::acq_rel:
+    return atomic_fetch_or<T, addressSpace, sycl::memory_order::acq_rel,
+                           sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::seq_cst:
+    return atomic_fetch_or<T, addressSpace, sycl::memory_order::seq_cst,
+                           sycl::memory_scope::device>(addr, operand);
+  default:
+    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                    "atomics are: sycl::memory_order::relaxed, "
+                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+  }
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          typename T1, typename T2>
+inline T1 atomic_fetch_or(T1 *addr, T2 operand,
+                           sycl::memory_order memoryOrder) {
+  atomic_fetch_or<T1, addressSpace>(addr, operand, memoryOrder);
+}
+
+/// Atomically xor the value at the addr with the value operand, and assign
+/// the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to use in bitwise XOR operation with the value at the \p addr.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T,
+          sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+inline T atomic_fetch_xor(T *addr, T operand) {
+  auto atm =
+      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_xor(operand);
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device,
+          typename T1, typename T2>
+inline T1 atomic_fetch_xor(T1 *addr, T2 operand) {
+  auto atm =
+      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_xor(operand);
+}
+
+/// Atomically xor the value at the addr with the value operand, and assign
+/// the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to use in bitwise XOR operation with the value at the \p addr.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace =
+                          sycl::access::address_space::global_space>
+inline T atomic_fetch_xor(T *addr, T operand,
+                          sycl::memory_order memoryOrder) {
+  switch (memoryOrder) {
+  case sycl::memory_order::relaxed:
+    return atomic_fetch_xor<T, addressSpace, sycl::memory_order::relaxed,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::acq_rel:
+    return atomic_fetch_xor<T, addressSpace, sycl::memory_order::acq_rel,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::seq_cst:
+    return atomic_fetch_xor<T, addressSpace, sycl::memory_order::seq_cst,
+                            sycl::memory_scope::device>(addr, operand);
+  default:
+    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                    "atomics are: sycl::memory_order::relaxed, "
+                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+  }
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          typename T1, typename T2>
+inline T1 atomic_fetch_xor(T1 *addr, T2 operand,
+                           sycl::memory_order memoryOrder) {
+  atomic_fetch_xor<T1, addressSpace>(addr, operand, memoryOrder);
+}
+
+/// Atomically calculate the minimum of the value at addr and the value operand
+/// and assign the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T,
+          sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+inline T atomic_fetch_min(T *addr, T operand) {
+  auto atm =
+      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_min(operand);
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device,
+          typename T1, typename T2>
+inline T1 atomic_fetch_min(T1 *addr, T2 operand) {
+  auto atm =
+      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_min(operand);
+}
+
+/// Atomically calculate the minimum of the value at addr and the value operand
+/// and assign the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace =
+                          sycl::access::address_space::global_space>
+inline T atomic_fetch_min(T *addr, T operand,
+                          sycl::memory_order memoryOrder) {
+  switch (memoryOrder) {
+  case sycl::memory_order::relaxed:
+    return atomic_fetch_min<T, addressSpace, sycl::memory_order::relaxed,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::acq_rel:
+    return atomic_fetch_min<T, addressSpace, sycl::memory_order::acq_rel,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::seq_cst:
+    return atomic_fetch_min<T, addressSpace, sycl::memory_order::seq_cst,
+                            sycl::memory_scope::device>(addr, operand);
+  default:
+    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                    "atomics are: sycl::memory_order::relaxed, "
+                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+  }
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          typename T1, typename T2>
+inline T1 atomic_fetch_min(T1 *addr, T2 operand,
+                           sycl::memory_order memoryOrder) {
+  atomic_fetch_min<T1, addressSpace>(addr, operand, memoryOrder);
+}
+
+/// Atomically calculate the maximum of the value at addr and the value operand
+/// and assign the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T,
+          sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+inline T atomic_fetch_max(T *addr, T operand) {
+  auto atm =
+      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_max(operand);
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device,
+          typename T1, typename T2>
+inline T1 atomic_fetch_max(T1 *addr, T2 operand) {
+  auto atm =
+      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.fetch_max(operand);
+}
+
+/// Atomically calculate the maximum of the value at addr and the value operand
+/// and assign the result to the value at addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace =
+                          sycl::access::address_space::global_space>
+inline T atomic_fetch_max(T *addr, T operand,
+                          sycl::memory_order memoryOrder) {
+  switch (memoryOrder) {
+  case sycl::memory_order::relaxed:
+    return atomic_fetch_max<T, addressSpace, sycl::memory_order::relaxed,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::acq_rel:
+    return atomic_fetch_max<T, addressSpace, sycl::memory_order::acq_rel,
+                            sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::seq_cst:
+    return atomic_fetch_max<T, addressSpace, sycl::memory_order::seq_cst,
+                            sycl::memory_scope::device>(addr, operand);
+  default:
+    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                    "atomics are: sycl::memory_order::relaxed, "
+                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+  }
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          typename T1, typename T2>
+inline T1 atomic_fetch_max(T1 *addr, T2 operand,
+                           sycl::memory_order memoryOrder) {
+  atomic_fetch_max<T1, addressSpace>(addr, operand, memoryOrder);
+}
+
+/// Atomically set \p operand to the value stored in \p addr, if old value stored in
+/// \p addr is equal to zero or greater than \p operand, else decrease the value stored
+/// in \p addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The threshold value.
+/// \param memoryOrder The memory ordering used.
+/// \returns The old value stored in \p addr.
+template <sycl::access::address_space addressSpace = sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+inline unsigned int atomic_fetch_compare_dec(unsigned int *addr,
+                                             unsigned int operand) {
+  auto atm = sycl::atomic_ref<unsigned int, memoryOrder, memoryScope,
+                                  addressSpace>(addr[0]);
+  unsigned int old;
+
+	while (true) {
+	  old = atm.load();
+	  if (old == 0 || old > operand) {
+		  if (atm.compare_exchange_strong(old, operand))
+        break;
+	  } else if (atm.compare_exchange_strong(old, old - 1))
+      break;
+	}
+
+  return old;
+}
+
+/// Atomically increment the value stored in \p addr if old value stored in \p
+/// addr is less than \p operand, else set 0 to the value stored in \p addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The threshold value.
+/// \param memoryOrder The memory ordering used.
+/// \returns The old value stored in \p addr.
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+inline unsigned int atomic_fetch_compare_inc(unsigned int *addr,
+                                             unsigned int operand) {
+  auto atm = sycl::atomic_ref<unsigned int, memoryOrder, memoryScope,
+                                  addressSpace>(addr[0]);
+  unsigned int old;
+  while (true) {
+    old = atm.load();
+    if (old >= operand) {
+      if (atm.compare_exchange_strong(old, 0))
+        break;
+    } else if (atm.compare_exchange_strong(old, old + 1))
+      break;
+  }
+  return old;
+}
+
+/// Atomically increment the value stored in \p addr if old value stored in \p
+/// addr is less than \p operand, else set 0 to the value stored in \p addr.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The threshold value.
+/// \param memoryOrder The memory ordering used.
+/// \returns The old value stored in \p addr.
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space>
+inline unsigned int
+atomic_fetch_compare_inc(unsigned int *addr, unsigned int operand,
+                         sycl::memory_order memoryOrder) {
+  switch (memoryOrder) {
+  case sycl::memory_order::relaxed:
+    return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::relaxed,
+                                    sycl::memory_scope::device>(addr,
+                                                                   operand);
+  case sycl::memory_order::acq_rel:
+    return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::acq_rel,
+                                    sycl::memory_scope::device>(addr,
+                                                                   operand);
+  case sycl::memory_order::seq_cst:
+    return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::seq_cst,
+                                    sycl::memory_scope::device>(addr,
+                                                                   operand);
+  default:
+    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                    "atomics are: sycl::memory_order::relaxed, "
+                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+  }
+}
+
+/// Atomically exchange the value at the address addr with the value operand.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to be exchanged with the value pointed by \p addr.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T,
+          sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+inline T atomic_exchange(T *addr, T operand) {
+  auto atm =
+      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.exchange(operand);
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device,
+          typename T1, typename T2>
+inline T1 atomic_exchange(T1 *addr, T2 operand) {
+  auto atm =
+      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  return atm.exchange(operand);
+}
+
+/// Atomically exchange the value at the address addr with the value operand.
+/// \param [in, out] addr The pointer to the data.
+/// \param operand The value to be exchanged with the value pointed by \p addr.
+/// \param memoryOrder The memory ordering used.
+/// \returns The value at the \p addr before the call.
+template <typename T, sycl::access::address_space addressSpace =
+                          sycl::access::address_space::global_space>
+inline T atomic_exchange(T *addr, T operand,
+                         sycl::memory_order memoryOrder) {
+  switch (memoryOrder) {
+  case sycl::memory_order::relaxed:
+    return atomic_exchange<T, addressSpace, sycl::memory_order::relaxed,
+                           sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::acq_rel:
+    return atomic_exchange<T, addressSpace, sycl::memory_order::acq_rel,
+                           sycl::memory_scope::device>(addr, operand);
+  case sycl::memory_order::seq_cst:
+    return atomic_exchange<T, addressSpace, sycl::memory_order::seq_cst,
+                           sycl::memory_scope::device>(addr, operand);
+  default:
+    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
+                    "atomics are: sycl::memory_order::relaxed, "
+                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
+  }
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          typename T1, typename T2>
+inline T1 atomic_exchange(T1 *addr, T2 operand,
+                           sycl::memory_order memoryOrder) {
+  atomic_exchange<T1, addressSpace>(addr, operand, memoryOrder);
+}
+
+/// Atomically compare the value at \p addr to the value expected and exchange
+/// with the value desired if the value at \p addr is equal to the value expected.
+/// Returns the value at the \p addr before the call.
+/// \param [in, out] addr Multi_ptr.
+/// \param expected The value to compare against the value at \p addr.
+/// \param desired The value to assign to \p addr if the value at \p addr is expected.
+/// \param success The memory ordering used when comparison succeeds.
+/// \param fail The memory ordering used when comparison fails.
+/// \returns The value at the \p addr before the call.
+template <typename T,
+          sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+T atomic_compare_exchange_strong(
+    sycl::multi_ptr<T, addressSpace> addr, T expected, T desired,
+    sycl::memory_order success = sycl::memory_order::relaxed,
+    sycl::memory_order fail = sycl::memory_order::relaxed) {
+  auto atm = sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(*addr);
+
+  atm.compare_exchange_strong(expected, desired, success, fail);
+  return expected;
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device,
+          typename T1, typename T2, typename T3>
+T1 atomic_compare_exchange_strong(
+    sycl::multi_ptr<T1, addressSpace> addr, T2 expected, T3 desired,
+    sycl::memory_order success = sycl::memory_order::relaxed,
+    sycl::memory_order fail = sycl::memory_order::relaxed) {
+  auto atm =
+      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(*addr);
+  T1 expected_value = expected;
+  atm.compare_exchange_strong(expected_value, desired, success, fail);
+  return expected_value;
+}
+
+/// Atomically compare the value at \p addr to the value expected and exchange
+/// with the value desired if the value at \p addr is equal to the value expected.
+/// Returns the value at the \p addr before the call.
+/// \param [in] addr The pointer to the data.
+/// \param expected The value to compare against the value at \p addr.
+/// \param desired The value to assign to \p addr if the value at \p addr is expected.
+/// \param success The memory ordering used when comparison succeeds.
+/// \param fail The memory ordering used when comparison fails.
+/// \returns The value at the \p addr before the call.
+template <typename T,
+          sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device>
+T atomic_compare_exchange_strong(
+    T *addr, T expected, T desired,
+    sycl::memory_order success = sycl::memory_order::relaxed,
+    sycl::memory_order fail = sycl::memory_order::relaxed) {
+  auto atm =
+      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  atm.compare_exchange_strong(expected, desired, success, fail);
+  return expected;
+}
+
+template <sycl::access::address_space addressSpace =
+              sycl::access::address_space::global_space,
+          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
+          sycl::memory_scope memoryScope = sycl::memory_scope::device,
+          typename T1, typename T2, typename T3>
+T1 atomic_compare_exchange_strong(
+    T1 *addr, T2 expected, T3 desired,
+    sycl::memory_order success = sycl::memory_order::relaxed,
+    sycl::memory_order fail = sycl::memory_order::relaxed) {
+  T1 expected_value = expected;
+  auto atm =
+      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
+  atm.compare_exchange_strong(expected_value, desired, success, fail);
+  return expected_value;
+}
+
+/// Atomic extension to implement standard APIs in std::atomic
+namespace detail{
+template <typename T> struct IsValidAtomicType {
+  static constexpr bool value =
+      (std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
+       std::is_same<T, long>::value || std::is_same<T, unsigned long>::value ||
+       std::is_same<T, long long>::value ||
+       std::is_same<T, unsigned long long>::value ||
+       std::is_same<T, float>::value || std::is_same<T, double>::value ||
+       std::is_pointer<T>::value);
+};
+} // namespace detail
+
+template <typename T,
+          sycl::memory_scope DefaultScope = sycl::memory_scope::system,
+          sycl::memory_order DefaultOrder = sycl::memory_order::seq_cst,
+          sycl::access::address_space Space =
+              sycl::access::address_space::generic_space>
+class atomic{
+  static_assert(
+    detail::IsValidAtomicType<T>::value,
+    "Invalid atomic type.  Valid types are int, unsigned int, long, "
+      "unsigned long, long long, unsigned long long, float, double "
+      "and pointer types");
+  T __d;
+
+public:
+  /// default memory synchronization order
+  static constexpr sycl::memory_order default_read_order =
+      sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>::default_read_order;
+  static constexpr sycl::memory_order default_write_order =
+      sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>::default_write_order;
+  static constexpr sycl::memory_scope default_scope = DefaultScope;
+  static constexpr sycl::memory_order default_read_modify_write_order =
+      DefaultOrder;
+  
+
+  /// Default constructor.
+  constexpr atomic() noexcept = default;
+  /// Constructor with initialize value.
+  constexpr atomic(T d) noexcept : __d(d){};
+
+  /// atomically replaces the value of the referenced object with a non-atomic argument
+  /// \param operand The value to replace the pointed value.
+  /// \param memoryOrder The memory ordering used.
+  /// \param memoryScope The memory scope used.
+  void store(T operand, sycl::memory_order memoryOrder = default_write_order,
+             sycl::memory_scope memoryScope = default_scope) noexcept {
+    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
+    atm.store(operand, memoryOrder, memoryScope);
+  }
+
+  /// atomically obtains the value of the referenced object
+  /// \param memoryOrder The memory ordering used.
+  /// \param memoryScope The memory scope used.
+  /// \returns The value of the referenced object
+  T load(sycl::memory_order memoryOrder = default_read_order,
+         sycl::memory_scope memoryScope = default_scope) const noexcept {
+    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(
+      const_cast<T &>(__d));
+    return atm.load(memoryOrder, memoryScope);
+  }
+
+  /// atomically replaces the value of the referenced object and obtains the value held previously
+  /// \param operand The value to replace the pointed value.
+  /// \param memoryOrder The memory ordering used.
+  /// \param memoryScope The memory scope used.
+  /// \returns The value of the referenced object before the call.
+  T exchange(T operand,
+             sycl::memory_order memoryOrder = default_read_modify_write_order,
+             sycl::memory_scope memoryScope = default_scope) noexcept {
+
+    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
+    return atm.exchange(operand, memoryOrder, memoryScope);
+  }
+
+  /// atomically compares the value of the referenced object with non-atomic argument 
+  /// and performs atomic exchange if equal or atomic load if not
+  /// \param expected The value expected to be found in the object referenced by the atomic_ref object
+  /// \param desired  The value to store in the referenced object if it is as expected
+  /// \param success The memory models for the read-modify-write
+  /// \param failure The memory models for load operations
+  /// \param memoryScope The memory scope used.
+  /// \returns true if the referenced object was successfully changed, false otherwise.
+  bool compare_exchange_weak(
+      T &expected, T desired,
+      sycl::memory_order success, sycl::memory_order failure,
+      sycl::memory_scope memoryScope = default_scope) noexcept {
+    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
+    return atm.compare_exchange_weak(expected, desired, success, failure, memoryScope);
+  }
+  /// \param expected The value expected to be found in the object referenced by the atomic_ref object
+  /// \param desired  The value to store in the referenced object if it is as expected
+  /// \param memoryOrder 	The memory synchronization ordering for operations
+  /// \param memoryScope The memory scope used.
+  /// \returns true if the referenced object was successfully changed, false otherwise.
+  bool compare_exchange_weak(T &expected, T desired,
+                  sycl::memory_order memoryOrder = default_read_modify_write_order,
+                  sycl::memory_scope memoryScope = default_scope) noexcept {
+    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
+    return atm.compare_exchange_weak(expected, desired, memoryOrder, memoryScope);
+  }
+
+  /// atomically compares the value of the referenced object with non-atomic argument 
+  /// and performs atomic exchange if equal or atomic load if not
+  /// \param expected The value expected to be found in the object referenced by the atomic_ref object
+  /// \param desired  The value to store in the referenced object if it is as expected
+  /// \param success The memory models for the read-modify-write
+  /// \param failure The memory models for load operations
+  /// \param memoryScope The memory scope used.
+  /// \returns true if the referenced object was successfully changed, false otherwise.
+  bool compare_exchange_strong(
+      T &expected, T desired,
+      sycl::memory_order success, sycl::memory_order failure,
+      sycl::memory_scope memoryScope = default_scope) noexcept {
+
+    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
+    return atm.compare_exchange_strong(expected, desired, success, failure, memoryScope);
+  }
+  /// \param expected The value expected to be found in the object referenced by the atomic_ref object
+  /// \param desired  The value to store in the referenced object if it is as expected
+  /// \param memoryOrder 	The memory synchronization ordering for operations
+  /// \param memoryScope The memory scope used.
+  /// \returns true if the referenced object was successfully changed, false otherwise.
+  bool compare_exchange_strong(T &expected, T desired,
+                    sycl::memory_order memoryOrder = default_read_modify_write_order,
+                    sycl::memory_scope memoryScope = default_scope) noexcept {
+    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
+    return atm.compare_exchange_strong(expected, desired, memoryOrder, memoryScope);
+  }
+
+  /// atomically adds the argument to the value stored in the atomic object and obtains the value held previously
+  /// \param operand 	The other argument of arithmetic addition
+  /// \param memoryOrder The memory ordering used.
+  /// \param memoryScope The memory scope used.
+  /// \returns The value of the referenced object before the call.
+  T fetch_add(T operand,
+              sycl::memory_order memoryOrder = default_read_modify_write_order,
+              sycl::memory_scope  memoryScope = default_scope) noexcept {
+
+    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
+    return atm.fetch_add(operand, memoryOrder,  memoryScope);
+  }
+
+  /// atomically subtracts the argument from the value stored in the atomic object and obtains the value held previously
+  /// \param operand 	The other argument of arithmetic subtraction
+  /// \param memoryOrder The memory ordering used.
+  /// \param memoryScope The memory scope used.
+  /// \returns The value of the referenced object before the call.
+  T fetch_sub(T operand,
+              sycl::memory_order memoryOrder = default_read_modify_write_order,
+              sycl::memory_scope memoryScope = default_scope) noexcept {
+
+    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
+    return atm.fetch_sub(operand, memoryOrder, memoryScope);
+  }
+};
+
+} // namespace dpct
+#endif // __DPCT_ATOMIC_HPP__
diff --git a/dpct/blas_utils.hpp b/dpct/blas_utils.hpp
new file mode 100644
index 0000000000000..df222c528bc08
--- /dev/null
+++ b/dpct/blas_utils.hpp
@@ -0,0 +1,1792 @@
+//==---- blas_utils.hpp----------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_BLAS_UTILS_HPP__
+#define __DPCT_BLAS_UTILS_HPP__
+
+#include "memory.hpp"
+#include "util.hpp"
+#include "lib_common_utils.hpp"
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include <utility>
+#include <vector>
+#include <thread>
+
+namespace dpct {
+
+/// Get the value of \p s.
+/// Copy the data to host synchronously, then return the data.
+/// \param [in] p The pointer points the data.
+/// \param [in] q The queue where the memory copy should be executed.
+template <typename T>
+inline auto get_value(const T *s, sycl::queue &q) {
+  return detail::get_value(s, q);
+}
+
+namespace detail {
+inline void mem_free(sycl::queue *exec_queue,
+                     std::vector<void *> pointers_array, sycl::event e) {
+  e.wait();
+  for (auto p : pointers_array)
+    sycl::free(p, *exec_queue);
+}
+
+inline int stride_for(int num_elems, int mem_align_in_elems) {
+  return ((num_elems - 1) / mem_align_in_elems + 1) * mem_align_in_elems;
+}
+
+#ifndef DPCT_USM_LEVEL_NONE
+template<typename T>
+class working_memory {
+  T *_input_ptr;
+  T *_temp_ptr;
+  bool _is_sycl_malloced = false;
+  bool _is_scalar_value = false;
+  sycl::queue _q;
+  sycl::event _e;
+
+public:
+  working_memory(size_t size, sycl::queue q) : _q(q) {
+    _is_scalar_value = false;
+    _temp_ptr = (T *)sycl::malloc_device(size, q);
+  }
+  working_memory(T *result_ptr, sycl::queue q) : _input_ptr(result_ptr), _q(q) {
+    _is_scalar_value = true;
+    _is_sycl_malloced = sycl::get_pointer_type(_input_ptr, _q.get_context()) !=
+                        sycl::usm::alloc::unknown;
+    if (!_is_sycl_malloced)
+      _temp_ptr = sycl::malloc_shared<T>(1, _q);
+  }
+  auto get_ptr() {
+    if (_is_scalar_value && _is_sycl_malloced)
+      return _input_ptr;
+    return _temp_ptr;
+  }
+  void set_event(sycl::event e) { _e = e; }
+  ~working_memory() {
+    if (_is_scalar_value) {
+      if (!_is_sycl_malloced) {
+        _q.memcpy(_input_ptr, _temp_ptr, sizeof(T)).wait();
+        sycl::free(_temp_ptr, _q);
+      }
+    } else {
+      std::vector<void *> ptrs{_temp_ptr};
+      dpct::async_dpct_free(ptrs, {_e});
+    }
+  }
+};
+#endif
+
+template <typename Tx, typename Tr>
+inline void nrm2_impl(sycl::queue &q, int n, const void *x, int incx,
+                         void *result) {
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                           "Project does not support this API.");
+#else
+#ifdef DPCT_USM_LEVEL_NONE
+  auto x_buffer = dpct::get_buffer<Tx>(x);
+  auto r_buffer =
+      sycl::buffer<Tr, 1>(reinterpret_cast<Tr *>(result), sycl::range<1>(1));
+  if (dpct::is_device_ptr(result))
+    r_buffer = dpct::get_buffer<Tr>(result);
+  oneapi::mkl::blas::column_major::nrm2(q, n, x_buffer, incx, r_buffer);
+#else
+  working_memory<Tr> res_mem(reinterpret_cast<Tr *>(result), q);
+  oneapi::mkl::blas::column_major::nrm2(q, n, reinterpret_cast<const Tx *>(x),
+                                        incx, res_mem.get_ptr());
+#endif
+#endif
+}
+
+template <bool is_conjugate, class Txy, class Tr>
+inline void dotuc_impl(sycl::queue &q, int n, const Txy *x, int incx,
+                          const Txy *y, int incy, Tr *result) {
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                           "Project does not support this API.");
+#else
+#ifdef DPCT_USM_LEVEL_NONE
+  auto x_buffer = dpct::get_buffer<Txy>(x);
+  auto y_buffer = dpct::get_buffer<Txy>(y);
+  auto r_buffer = sycl::buffer<Tr, 1>((Tr *)result, sycl::range<1>(1));
+  if (dpct::is_device_ptr(result))
+    r_buffer = dpct::get_buffer<Tr>(result);
+  if constexpr (std::is_same_v<Txy, std::complex<float>> ||
+                std::is_same_v<Txy, std::complex<double>>) {
+    if constexpr (is_conjugate)
+      oneapi::mkl::blas::column_major::dotc(q, n, x_buffer, incx, y_buffer,
+                                            incy, r_buffer);
+    else
+      oneapi::mkl::blas::column_major::dotu(q, n, x_buffer, incx, y_buffer,
+                                            incy, r_buffer);
+  } else
+    oneapi::mkl::blas::column_major::dot(q, n, x_buffer, incx, y_buffer, incy,
+                                         r_buffer);
+#else
+  working_memory<Tr> res_mem(result, q);
+  if constexpr (std::is_same_v<Txy, std::complex<float>> ||
+                std::is_same_v<Txy, std::complex<double>>) {
+    if constexpr (is_conjugate)
+      oneapi::mkl::blas::column_major::dotc(q, n, x, incx, y, incy, res_mem.get_ptr());
+    else
+      oneapi::mkl::blas::column_major::dotu(q, n, x, incx, y, incy, res_mem.get_ptr());
+  } else
+    oneapi::mkl::blas::column_major::dot(q, n, x, incx, y, incy, res_mem.get_ptr());
+#endif
+#endif
+}
+
+template <bool is_conjugate>
+inline void dotuc(sycl::queue &q, int n, const void *x,
+                     library_data_t x_type, int incx, const void *y,
+                     library_data_t y_type, int incy, void *result,
+                     library_data_t result_type) {
+  std::uint64_t key = detail::get_type_combination_id(x_type, y_type, result_type);
+  switch (key) {
+  case detail::get_type_combination_id(library_data_t::real_float, library_data_t::real_float,
+                       library_data_t::real_float): {
+    detail::dotuc_impl<is_conjugate>(
+        q, n, reinterpret_cast<const float *>(x), incx,
+        reinterpret_cast<const float *>(y), incy,
+        reinterpret_cast<float *>(result));
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_double, library_data_t::real_double,
+                       library_data_t::real_double): {
+    detail::dotuc_impl<is_conjugate>(
+        q, n, reinterpret_cast<const double *>(x), incx,
+        reinterpret_cast<const double *>(y), incy,
+        reinterpret_cast<double *>(result));
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_float,
+                       library_data_t::complex_float,
+                       library_data_t::complex_float): {
+    detail::dotuc_impl<is_conjugate>(
+        q, n, reinterpret_cast<const std::complex<float> *>(x), incx,
+        reinterpret_cast<const std::complex<float> *>(y), incy,
+        reinterpret_cast<std::complex<float> *>(result));
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_double,
+                       library_data_t::complex_double,
+                       library_data_t::complex_double): {
+    detail::dotuc_impl<is_conjugate>(
+        q, n, reinterpret_cast<const std::complex<double> *>(x), incx,
+        reinterpret_cast<const std::complex<double> *>(y), incy,
+        reinterpret_cast<std::complex<double> *>(result));
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_half, library_data_t::real_half,
+                       library_data_t::real_half): {
+    detail::dotuc_impl<is_conjugate>(
+        q, n, reinterpret_cast<const sycl::half *>(x), incx,
+        reinterpret_cast<const sycl::half *>(y), incy,
+        reinterpret_cast<sycl::half *>(result));
+    break;
+  }
+  default:
+    throw std::runtime_error("the combination of data type is unsupported");
+  }
+}
+
+template <class Tx, class Te>
+inline void scal_impl(sycl::queue &q, int n, const void *alpha, void *x,
+                         int incx) {
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                           "Project does not support this API.");
+#else
+  Te alpha_val = dpct::get_value(reinterpret_cast<const Te *>(alpha), q);
+  auto data_x = get_memory<Tx>(x);
+  oneapi::mkl::blas::column_major::scal(q, n, alpha_val,
+                                        data_x, incx);
+#endif
+}
+
+template <class Txy, class Te>
+inline void axpy_impl(sycl::queue &q, int n, const void *alpha, const void *x,
+                        int incx, void *y, int incy) {
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                           "Project does not support this API.");
+#else
+  Te alpha_val = dpct::get_value(reinterpret_cast<const Te *>(alpha), q);
+  auto data_x = get_memory<const Txy>(x);
+  auto data_y = get_memory<Txy>(y);
+  oneapi::mkl::blas::column_major::axpy(q, n, alpha_val,
+                                        data_x, incx,
+                                        data_y, incy);
+#endif
+}
+
+template <class Txy, class Tc, class Ts>
+inline void rot_impl(sycl::queue &q, int n, void *x, int incx, void *y,
+                        int incy, const void *c, const void *s) {
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                           "Project does not support this API.");
+#else
+  Tc c_value = dpct::get_value(reinterpret_cast<const Tc *>(c), q);
+  Ts s_value = dpct::get_value(reinterpret_cast<const Ts *>(s), q);
+  auto data_x = get_memory<Txy>(x);
+  auto data_y = get_memory<Txy>(y);
+  oneapi::mkl::blas::column_major::rot(q, n, data_x, incx,
+                                       data_y, incy, c_value,
+                                       s_value);
+#endif
+}
+
+template <class Ta, class Tb, class Tc, class Ts>
+inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                         oneapi::mkl::transpose b_trans, int m, int n, int k,
+                         const void *alpha, const void *a, int lda, const void *b,
+                         int ldb, const void *beta, void *c, int ldc) {
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                           "Project does not support this API.");
+#else
+  Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+  Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+  auto data_a = get_memory<const Ta>(a);
+  auto data_b = get_memory<const Tb>(b);
+  auto data_c = get_memory<Tc>(c);
+  oneapi::mkl::blas::column_major::gemm(
+      q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
+      data_b, ldb, beta_value, data_c, ldc);
+#endif
+}
+
+template <class Ta, class Tb, class Tc, class Ts>
+inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                            oneapi::mkl::transpose b_trans, int m, int n, int k,
+                            const void *alpha, const void **a, int lda,
+                            const void **b, int ldb, const void *beta, void **c,
+                            int ldc, int batch_size) {
+  struct matrix_info_t {
+    oneapi::mkl::transpose transpose_info[2];
+    Ts value_info[2];
+    std::int64_t size_info[3];
+    std::int64_t ld_info[3];
+    std::int64_t groupsize_info;
+  };
+
+  Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+  Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+
+  matrix_info_t *matrix_info =
+      (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
+  matrix_info->transpose_info[0] = a_trans;
+  matrix_info->transpose_info[1] = b_trans;
+  matrix_info->value_info[0] = alpha_value;
+  matrix_info->value_info[1] = beta_value;
+  matrix_info->size_info[0] = m;
+  matrix_info->size_info[1] = n;
+  matrix_info->size_info[2] = k;
+  matrix_info->ld_info[0] = lda;
+  matrix_info->ld_info[1] = ldb;
+  matrix_info->ld_info[2] = ldc;
+  matrix_info->groupsize_info = batch_size;
+
+  sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
+      q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
+      matrix_info->size_info, matrix_info->size_info + 1,
+      matrix_info->size_info + 2, matrix_info->value_info,
+      reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
+      reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
+      matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
+      matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
+
+  q.submit([&](sycl::handler &cgh) {
+    cgh.depends_on(e);
+    cgh.host_task([=] { std::free(matrix_info); });
+  });
+}
+
+template <class Ta, class Tb, class Tc, class Ts>
+inline void
+gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                    oneapi::mkl::transpose b_trans, int m, int n,
+                    int k, const void *alpha, const void *a, int lda,
+                    long long int stride_a, const void *b, int ldb,
+                    long long int stride_b, const void *beta, void *c,
+                    int ldc, long long int stride_c, int batch_size) {
+  Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+  Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+  auto data_a = get_memory<const Ta>(a);
+  auto data_b = get_memory<const Tb>(b);
+  auto data_c = get_memory<Tc>(c);
+  oneapi::mkl::blas::column_major::gemm_batch(
+      q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
+      stride_a, data_b, ldb, stride_b, beta_value,
+      data_c, ldc, stride_c, batch_size);
+}
+
+template <bool is_hermitian, class T, class Tbeta>
+inline void rk_impl(sycl::queue &q, oneapi::mkl::uplo uplo,
+                          oneapi::mkl::transpose trans, int n, int k,
+                          const T *alpha, const T *a, int lda, const T *b,
+                          int ldb, const Tbeta *beta, T *c, int ldc) {
+  // For symmetric matrix, this function performs: C = alpha*OP(A)*(OP(B))^T + beta*C
+  // For Hermitian matrix, this function performs: C = alpha*OP(A)*(OP(B))^H + beta*C
+  // The gemmt() function performs: C = alpha*OPA(A)*OPB(B) + beta*C
+  // So the OPB need be updated before we call gemmt().
+  using Ty = typename dpct::DataType<T>::T2;
+  using Ts = typename dpct::DataType<Tbeta>::T2;
+  Ty alpha_value = dpct::get_value(reinterpret_cast<const Ty *>(alpha), q);
+  Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+  oneapi::mkl::transpose trans_A = trans, trans_B = trans;
+  int origin_b_rows = trans == oneapi::mkl::transpose::nontrans ? n : k;
+  int origin_b_cols = trans == oneapi::mkl::transpose::nontrans ? k : n;
+
+  if ((is_hermitian && trans == oneapi::mkl::transpose::trans) ||
+      (!is_hermitian && !std::is_floating_point_v<Ty> && trans == oneapi::mkl::transpose::conjtrans)) {
+    // In this case, OPB need be a conjugate operation,
+    // but only notrans, conjtrans and trans are available.
+    // So we need do a conjtrans operation first, then do a trans operation.
+    trans_B = oneapi::mkl::transpose::trans;
+    auto data_a = get_memory<const Ty>(a);
+    auto data_c = get_memory<Ty>(c);
+#ifdef DPCT_USM_LEVEL_NONE
+    auto new_B_buffer = sycl::buffer<Ty, 1>(sycl::range<1>(origin_b_rows * origin_b_cols));
+    auto from_buffer = dpct::get_buffer<Ty>(b);
+    oneapi::mkl::blas::column_major::omatcopy_batch(
+          q, oneapi::mkl::transpose::conjtrans, origin_b_rows, origin_b_cols,
+          Ts(1.0), from_buffer, ldb, origin_b_rows * ldb, new_B_buffer,
+          origin_b_cols, origin_b_rows * origin_b_cols, 1);
+    oneapi::mkl::blas::column_major::gemmt(
+        q, uplo, trans_A, trans_B, n, k, alpha_value,
+        data_a, lda, new_B_buffer, origin_b_cols, beta_value, data_c, ldc);
+#else
+    working_memory<T> new_B(origin_b_rows * origin_b_cols * sizeof(T), q);
+    oneapi::mkl::blas::column_major::omatcopy_batch(
+        q, oneapi::mkl::transpose::conjtrans, origin_b_rows, origin_b_cols,
+        Ts(1.0), reinterpret_cast<const Ty *>(b), ldb, origin_b_rows * ldb,
+        reinterpret_cast<Ty *>(new_B.get_ptr()), origin_b_cols,
+        origin_b_rows * origin_b_cols, 1);
+    sycl::event e = oneapi::mkl::blas::column_major::gemmt(
+        q, uplo, trans_A, trans_B, n, k, alpha_value,
+        data_a, lda, reinterpret_cast<Ty *>(new_B.get_ptr()), origin_b_cols,
+        beta_value, data_c, ldc);
+    new_B.set_event(e);
+#endif
+  } else {
+    if constexpr (is_hermitian) {
+      trans_B = trans == oneapi::mkl::transpose::nontrans
+                  ? oneapi::mkl::transpose::conjtrans
+                  : oneapi::mkl::transpose::nontrans;
+    } else {
+      trans_B = trans == oneapi::mkl::transpose::nontrans
+                  ? oneapi::mkl::transpose::trans
+                  : oneapi::mkl::transpose::nontrans;
+    }
+    auto data_a = get_memory<const Ty>(a);
+    auto data_b = get_memory<const Ty>(b);
+    auto data_c = get_memory<Ty>(c);
+    oneapi::mkl::blas::column_major::gemmt(
+        q, uplo, trans_A, trans_B, n, k, alpha_value,
+        data_a, lda, data_b, ldb, beta_value, data_c, ldc);
+  }
+}
+
+template <class Ta, class Tb, class Ts>
+inline void
+trsm_batch_impl(sycl::queue &q, oneapi::mkl::side left_right,
+                oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
+                oneapi::mkl::diag unit_diag, int m, int n, const void *alpha,
+                const void **a, int lda, void **b, int ldb, int batch_size) {
+  struct matrix_info_t {
+    matrix_info_t(oneapi::mkl::side side_info, oneapi::mkl::uplo uplo_info,
+                  oneapi::mkl::transpose transpose_info,
+                  oneapi::mkl::diag diag_info, Ts value_info, std::int64_t m,
+                  std::int64_t n, std::int64_t lda, std::int64_t ldb,
+                  std::int64_t groupsize_info)
+        : side_info(side_info), uplo_info(uplo_info),
+          transpose_info(transpose_info), diag_info(diag_info),
+          value_info(value_info), groupsize_info(groupsize_info) {
+      size_info[0] = m;
+      size_info[1] = n;
+      ld_info[0] = lda;
+      ld_info[1] = ldb;
+    }
+    oneapi::mkl::side side_info;
+    oneapi::mkl::uplo uplo_info;
+    oneapi::mkl::transpose transpose_info;
+    oneapi::mkl::diag diag_info;
+    Ts value_info;
+    std::int64_t size_info[2];
+    std::int64_t ld_info[2];
+    std::int64_t groupsize_info;
+  };
+
+  Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+
+  matrix_info_t *matrix_info =
+      new matrix_info_t(left_right, upper_lower, trans, unit_diag, alpha_value,
+                        m, n, lda, ldb, batch_size);
+
+  sycl::event e = oneapi::mkl::blas::column_major::trsm_batch(
+      q, &(matrix_info->side_info), &(matrix_info->uplo_info),
+      &(matrix_info->transpose_info), &(matrix_info->diag_info),
+      matrix_info->size_info, matrix_info->size_info + 1,
+      &(matrix_info->value_info), reinterpret_cast<const Ta **>(a),
+      matrix_info->ld_info, reinterpret_cast<Tb **>(b),
+      matrix_info->ld_info + 1, 1, &(matrix_info->groupsize_info));
+
+  q.submit([&](sycl::handler &cgh) {
+    cgh.depends_on(e);
+    cgh.host_task([=] { delete matrix_info; });
+  });
+}
+
+template <typename T>
+inline void getrfnp_batch_wrapper(sycl::queue &exec_queue, int n, T *a[],
+                                  int lda, int *info, int batch_size) {
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                           "Project does not support this API.");
+#else
+  using Ty = typename DataType<T>::T2;
+  // Set the info array value to 0
+  detail::dpct_memset<unsigned char>(exec_queue, info, 0, sizeof(int) * batch_size);
+  std::int64_t stride_a = n * lda;
+  std::int64_t scratchpad_size =
+      oneapi::mkl::lapack::getrfnp_batch_scratchpad_size<Ty>(
+          exec_queue, n, n, lda, stride_a, batch_size);
+
+  Ty *a_strided_mem =
+      (Ty *)dpct::dpct_malloc(stride_a * batch_size * sizeof(Ty), exec_queue);
+  T **host_a = (T **)std::malloc(batch_size * sizeof(T *));
+  dpct::dpct_memcpy(host_a, a, batch_size * sizeof(T *));
+  for (std::int64_t i = 0; i < batch_size; ++i)
+    dpct::dpct_memcpy(a_strided_mem + i * stride_a, host_a[i],
+                      n * lda * sizeof(T));
+
+#ifdef DPCT_USM_LEVEL_NONE
+  {
+    sycl::buffer<Ty, 1> scratchpad{sycl::range<1>(scratchpad_size)};
+    auto a_buffer = get_buffer<Ty>(a_strided_mem);
+    oneapi::mkl::lapack::getrfnp_batch(exec_queue, n, n, a_buffer, lda,
+                                       stride_a, batch_size, scratchpad,
+                                       scratchpad_size);
+  }
+  std::vector<sycl::event> events;
+  for (std::int64_t i = 0; i < batch_size; ++i)
+    events.push_back(detail::dpct_memcpy(exec_queue, host_a[i],
+                                         a_strided_mem + i * stride_a,
+                                         n * lda * sizeof(T), automatic));
+#else
+  Ty *scratchpad = sycl::malloc_device<Ty>(scratchpad_size, exec_queue);
+  sycl::event e = oneapi::mkl::lapack::getrfnp_batch(
+      exec_queue, n, n, a_strided_mem, lda, stride_a, batch_size, scratchpad,
+      scratchpad_size);
+  std::vector<sycl::event> events;
+  for (std::int64_t i = 0; i < batch_size; ++i)
+    events.push_back(detail::dpct_memcpy(exec_queue, host_a[i],
+                                         a_strided_mem + i * stride_a,
+                                         n * lda * sizeof(T), automatic, {e}));
+
+  std::vector<void *> ptrs{scratchpad, a_strided_mem};
+  dpct::async_dpct_free(ptrs, events, exec_queue);
+#endif
+
+  exec_queue.submit([&](sycl::handler &cgh) {
+    cgh.depends_on(events);
+    cgh.host_task([=] { std::free(host_a); });
+  });
+#endif
+}
+
+} // namespace detail
+
+inline oneapi::mkl::transpose get_transpose(int t) {
+  if (t == 0) {
+    return oneapi::mkl::transpose::nontrans;
+  } else if (t == 1) {
+    return oneapi::mkl::transpose::trans;
+  } else {
+    return oneapi::mkl::transpose::conjtrans;
+  }
+}
+
+/// Computes the LU factorizations of a batch of general matrices.
+/// \param [in] exec_queue The queue where the routine should be executed.
+/// \param [in] n The order of the matrices.
+/// \param [in, out] a Array of pointers to matrices. These matrices will be
+/// overwritten by lower triangulars with unit diagonal elements and upper
+/// triangulars.
+/// \param [in] lda The leading dimension of the matrices.
+/// \param [out] ipiv An array stores the pivot indices. If \p ipiv is nullptr,
+/// non-pivoting LU factorization is computed.
+/// \param [out] info An array stores the error information.
+/// \param [in] batch_size The size of the batch.
+template <typename T>
+inline void getrf_batch_wrapper(sycl::queue &exec_queue, int n, T *a[],
+                                int lda, int *ipiv, int *info, int batch_size) {
+  if (ipiv == nullptr) {
+    detail::getrfnp_batch_wrapper(exec_queue, n, a, lda, info, batch_size);
+    return;
+  }
+  using Ty = typename DataType<T>::T2;
+  // Set the info array value to 0
+  detail::dpct_memset<unsigned char>(exec_queue, info, 0, sizeof(int) * batch_size);
+#ifdef DPCT_USM_LEVEL_NONE
+  std::int64_t stride_a = n * lda;
+  std::int64_t stride_ipiv = n;
+  std::int64_t scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<Ty>(
+      exec_queue, n, n, lda, stride_a, stride_ipiv, batch_size);
+
+  T *a_buffer_ptr;
+  a_buffer_ptr = (T *)dpct_malloc(stride_a * batch_size * sizeof(T));
+
+  T **host_a = (T **)std::malloc(batch_size * sizeof(T *));
+  dpct_memcpy(host_a, a, batch_size * sizeof(T *));
+  for (std::int64_t i = 0; i < batch_size; ++i)
+    dpct_memcpy(a_buffer_ptr + i * stride_a, host_a[i], n * lda * sizeof(T));
+
+  {
+    sycl::buffer<std::int64_t, 1> ipiv_buf(
+        sycl::range<1>(batch_size * stride_ipiv));
+    sycl::buffer<Ty, 1> scratchpad{sycl::range<1>(scratchpad_size)};
+    auto a_buffer = get_buffer<Ty>(a_buffer_ptr);
+    oneapi::mkl::lapack::getrf_batch(exec_queue, n, n, a_buffer, lda, stride_a,
+                             ipiv_buf, stride_ipiv, batch_size, scratchpad,
+                             scratchpad_size);
+
+    auto to_buffer = get_buffer<int>(ipiv);
+    exec_queue.submit([&](sycl::handler &cgh) {
+      auto from_acc = ipiv_buf.get_access<sycl::access_mode::read>(cgh);
+      auto to_acc = to_buffer.get_access<sycl::access_mode::write>(cgh);
+      cgh.parallel_for<dpct_kernel_name<class getrf_device_int64_to_int, T>>(
+          sycl::range<2>(batch_size, n), [=](sycl::id<2> id) {
+            to_acc[id.get(0) * n + id.get(1)] =
+                static_cast<int>(from_acc[id.get(0) * stride_ipiv + id.get(1)]);
+          });
+    });
+  }
+
+  // Copy back to the original buffers
+  std::vector<sycl::event> events;
+  for (std::int64_t i = 0; i < batch_size; ++i)
+    events.push_back(detail::dpct_memcpy(exec_queue, host_a[i],
+                                         a_buffer_ptr + i * stride_a,
+                                         n * lda * sizeof(T), automatic));
+
+  std::vector<void *> ptrs{host_a};
+  std::thread mem_free_thread(
+      [=](std::vector<void *> pointers_array,
+          std::vector<sycl::event> events_array) {
+        sycl::event::wait(events_array);
+        for (auto p : pointers_array)
+          std::free(p);
+      },
+      ptrs, events);
+  mem_free_thread.detach();
+#else
+  std::int64_t m_int64 = n;
+  std::int64_t n_int64 = n;
+  std::int64_t lda_int64 = lda;
+  std::int64_t group_sizes = batch_size;
+  std::int64_t scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<Ty>(
+      exec_queue, &m_int64, &n_int64, &lda_int64, 1, &group_sizes);
+
+  Ty *scratchpad = sycl::malloc_device<Ty>(scratchpad_size, exec_queue);
+  std::int64_t *ipiv_int64 =
+      sycl::malloc_device<std::int64_t>(batch_size * n, exec_queue);
+  std::int64_t **ipiv_int64_ptr =
+      sycl::malloc_shared<std::int64_t *>(batch_size, exec_queue);
+  T **a_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
+  exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *)).wait();
+  for (std::int64_t i = 0; i < batch_size; ++i)
+    ipiv_int64_ptr[i] = ipiv_int64 + n * i;
+
+  oneapi::mkl::lapack::getrf_batch(exec_queue, &m_int64, &n_int64, (Ty **)a_shared, &lda_int64,
+                           ipiv_int64_ptr, 1, &group_sizes, scratchpad,
+                           scratchpad_size);
+
+  sycl::event e = exec_queue.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<dpct_kernel_name<class getrf_device_int64_to_int, T>>(
+        sycl::range<1>(batch_size * n), [=](sycl::id<1> idx) {
+          ipiv[idx] = static_cast<int>(ipiv_int64[idx]);
+        });
+  });
+
+  std::vector<void *> ptrs{scratchpad, ipiv_int64, ipiv_int64_ptr, a_shared};
+  async_dpct_free(ptrs, {e}, exec_queue);
+#endif
+}
+
+/// Solves a system of linear equations with a batch of LU-factored square
+/// coefficient matrices, with multiple right-hand sides.
+/// \param [in] exec_queue The queue where the routine should be executed.
+/// \param [in] trans Indicates the form of the linear equations.
+/// \param [in] n The order of the matrices.
+/// \param [in] nrhs The number of right hand sides.
+/// \param [in] a Array of pointers to matrices.
+/// \param [in] lda The leading dimension of the matrices in \p a.
+/// \param [in] ipiv An array stores the pivots.
+/// \param [in, out] b Array of pointers to matrices, whose columns are
+/// the right-hand sides for the systems of equations.
+/// \param [in] ldb The leading dimension of the matrices in \p b.
+/// \param [out] info A value stores the error information.
+/// \param [in] batch_size The size of the batch.
+template <typename T>
+inline void getrs_batch_wrapper(sycl::queue &exec_queue,
+                                oneapi::mkl::transpose trans, int n, int nrhs,
+                                const T *a[], int lda, const int *ipiv, T *b[],
+                                int ldb, int *info, int batch_size) {
+  using Ty = typename DataType<T>::T2;
+  // Set the info value to 0
+  *info = 0;
+#ifdef DPCT_USM_LEVEL_NONE
+  std::int64_t stride_a = n * lda;
+  std::int64_t stride_b = nrhs * ldb;
+  std::int64_t stride_ipiv = n;
+  std::int64_t scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<Ty>(
+      exec_queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b,
+      batch_size);
+
+  T *a_buffer_ptr, *b_buffer_ptr;
+  a_buffer_ptr = (T *)dpct_malloc(stride_a * batch_size * sizeof(T));
+  b_buffer_ptr = (T *)dpct_malloc(stride_b * batch_size * sizeof(T));
+
+  T **host_a = (T **)std::malloc(batch_size * sizeof(T *));
+  T **host_b = (T **)std::malloc(batch_size * sizeof(T *));
+  dpct_memcpy(host_a, a, batch_size * sizeof(T *));
+  dpct_memcpy(host_b, b, batch_size * sizeof(T *));
+  for (std::int64_t i = 0; i < batch_size; ++i) {
+    dpct_memcpy(a_buffer_ptr + i * stride_a, host_a[i], n * lda * sizeof(T));
+    dpct_memcpy(b_buffer_ptr + i * stride_b, host_b[i], nrhs * ldb * sizeof(T));
+  }
+
+  {
+    auto a_buffer = get_buffer<Ty>(a_buffer_ptr);
+    auto b_buffer = get_buffer<Ty>(b_buffer_ptr);
+    sycl::buffer<Ty, 1> scratchpad{sycl::range<1>(scratchpad_size)};
+    sycl::buffer<std::int64_t, 1> ipiv_buf(
+        sycl::range<1>(batch_size * stride_ipiv));
+    auto from_buf = get_buffer<int>(ipiv);
+    exec_queue.submit([&](sycl::handler &cgh) {
+      auto from_acc = from_buf.get_access<sycl::access_mode::read>(cgh);
+      auto to_acc = ipiv_buf.get_access<sycl::access_mode::write>(cgh);
+      cgh.parallel_for<dpct_kernel_name<class getrs_device_int64_to_int, T>>(
+          sycl::range<2>(batch_size, n), [=](sycl::id<2> id) {
+            to_acc[id.get(0) * stride_ipiv + id.get(1)] =
+                static_cast<std::int64_t>(from_acc[id.get(0) * n + id.get(1)]);
+          });
+    });
+
+    oneapi::mkl::lapack::getrs_batch(exec_queue, trans, n, nrhs, a_buffer, lda,
+                             stride_a, ipiv_buf, stride_ipiv, b_buffer, ldb,
+                             stride_b, batch_size, scratchpad, scratchpad_size);
+  }
+
+  // Copy back to the original buffers
+  std::vector<sycl::event> events;
+  for (std::int64_t i = 0; i < batch_size; ++i)
+    events.push_back(detail::dpct_memcpy(exec_queue, host_b[i],
+                                         b_buffer_ptr + i * stride_b,
+                                         nrhs * ldb * sizeof(T), automatic));
+  std::vector<void *> ptrs{host_a, host_b};
+  std::thread mem_free_thread(
+      [=](std::vector<void *> pointers_array,
+          std::vector<sycl::event> events_array) {
+        sycl::event::wait(events_array);
+        for (auto p : pointers_array)
+          std::free(p);
+      },
+      ptrs, events);
+  mem_free_thread.detach();
+#else
+  std::int64_t n_int64 = n;
+  std::int64_t nrhs_int64 = nrhs;
+  std::int64_t lda_int64 = lda;
+  std::int64_t ldb_int64 = ldb;
+  std::int64_t group_sizes = batch_size;
+  std::int64_t scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<Ty>(
+      exec_queue, &trans, &n_int64, &nrhs_int64, &lda_int64, &ldb_int64, 1,
+      &group_sizes);
+
+  Ty *scratchpad = sycl::malloc_device<Ty>(scratchpad_size, exec_queue);
+  std::int64_t *ipiv_int64 =
+      sycl::malloc_device<std::int64_t>(batch_size * n, exec_queue);
+  std::int64_t **ipiv_int64_ptr =
+      sycl::malloc_shared<std::int64_t *>(batch_size, exec_queue);
+  T **a_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
+  T **b_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
+  exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *));
+  exec_queue.memcpy(b_shared, b, batch_size * sizeof(T *));
+
+  exec_queue.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<dpct_kernel_name<class getrs_device_int64_to_int, T>>(
+        sycl::range<1>(batch_size * n), [=](sycl::id<1> idx) {
+          ipiv_int64[idx] = static_cast<std::int64_t>(ipiv[idx]);
+        });
+  }).wait();
+
+  for (std::int64_t i = 0; i < batch_size; ++i)
+    ipiv_int64_ptr[i] = ipiv_int64 + n * i;
+
+  sycl::event e = oneapi::mkl::lapack::getrs_batch(
+      exec_queue, &trans, &n_int64, &nrhs_int64, (Ty **)a_shared, &lda_int64,
+      ipiv_int64_ptr, (Ty **)b_shared, &ldb_int64, 1, &group_sizes, scratchpad,
+      scratchpad_size);
+
+  std::vector<void *> ptrs{scratchpad, ipiv_int64_ptr, ipiv_int64, a_shared, b_shared};
+  async_dpct_free(ptrs, {e}, exec_queue);
+#endif
+}
+
+/// Computes the inverses of a batch of LU-factored matrices.
+/// \param [in] exec_queue The queue where the routine should be executed.
+/// \param [in] n The order of the matrices.
+/// \param [in] a Array of pointers to matrices.
+/// \param [in] lda The leading dimension of the matrices in \p a.
+/// \param [in] ipiv An array stores the pivots.
+/// \param [out] b Array of pointers to inverse matrices.
+/// \param [in] ldb The leading dimension of the matrices in \p b.
+/// \param [out] info An array stores the error information.
+/// \param [in] batch_size The size of the batch.
+template <typename T>
+inline void getri_batch_wrapper(sycl::queue &exec_queue, int n,
+                                const T *a[], int lda, int *ipiv, T *b[],
+                                int ldb, int *info, int batch_size) {
+  using Ty = typename DataType<T>::T2;
+  // Set the info array value to 0
+  detail::dpct_memset<unsigned char>(exec_queue, info, 0, sizeof(int) * batch_size);
+#ifdef DPCT_USM_LEVEL_NONE
+  std::int64_t stride_b = n * ldb;
+  std::int64_t stride_ipiv = n;
+  std::int64_t scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<Ty>(
+      exec_queue, n, ldb, stride_b, stride_ipiv, batch_size);
+
+  T *b_buffer_ptr;
+  b_buffer_ptr = (T *)dpct_malloc(stride_b * batch_size * sizeof(T));
+
+  T **host_a = (T **)std::malloc(batch_size * sizeof(T *));
+  T **host_b = (T **)std::malloc(batch_size * sizeof(T *));
+  dpct_memcpy(host_a, a, batch_size * sizeof(T *));
+  dpct_memcpy(host_b, b, batch_size * sizeof(T *));
+
+  for (std::int64_t i = 0; i < batch_size; ++i) {
+    // Need to create a copy of input matrices "a" to keep them unchanged.
+    // Matrices "b" (copy of matrices "a") will be used as input and output
+    // parameter in oneapi::mkl::lapack::getri_batch call.
+    matrix_mem_copy(b_buffer_ptr + i * stride_b, host_a[i], ldb, lda, n, n,
+                    dpct::device_to_device, exec_queue);
+  }
+
+  {
+    auto b_buffer = get_buffer<Ty>(b_buffer_ptr);
+    sycl::buffer<Ty, 1> scratchpad{sycl::range<1>(scratchpad_size)};
+    sycl::buffer<std::int64_t, 1> ipiv_buf(
+        sycl::range<1>(batch_size * stride_ipiv));
+    auto from_buf = get_buffer<int>(ipiv);
+    exec_queue.submit([&](sycl::handler &cgh) {
+      auto from_acc = from_buf.get_access<sycl::access_mode::read>(cgh);
+      auto to_acc = ipiv_buf.get_access<sycl::access_mode::write>(cgh);
+      cgh.parallel_for<dpct_kernel_name<class getri_device_int64_to_int, T>>(
+          sycl::range<2>(batch_size, n), [=](sycl::id<2> id) {
+            to_acc[id.get(0) * stride_ipiv + id.get(1)] =
+                static_cast<std::int64_t>(from_acc[id.get(0) * n + id.get(1)]);
+          });
+    });
+
+    oneapi::mkl::lapack::getri_batch(exec_queue, n, b_buffer, ldb, stride_b, ipiv_buf,
+                             stride_ipiv, batch_size, scratchpad,
+                             scratchpad_size);
+  }
+
+  // Copy back to the original buffers
+  std::vector<sycl::event> events;
+  for (std::int64_t i = 0; i < batch_size; ++i)
+    events.push_back(detail::dpct_memcpy(exec_queue, host_b[i],
+                                         b_buffer_ptr + i * stride_b,
+                                         n * ldb * sizeof(T), automatic));
+  std::vector<void *> ptrs{host_a, host_b};
+  std::thread mem_free_thread(
+      [=](std::vector<void *> pointers_array,
+          std::vector<sycl::event> events_array) {
+        sycl::event::wait(events_array);
+        for (auto p : pointers_array)
+          std::free(p);
+      },
+      ptrs, events);
+  mem_free_thread.detach();
+#else
+  std::int64_t n_int64 = n;
+  std::int64_t ldb_int64 = ldb;
+  std::int64_t group_sizes = batch_size;
+  std::int64_t scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<Ty>(
+      exec_queue, &n_int64, &ldb_int64, 1, &group_sizes);
+
+  Ty *scratchpad = sycl::malloc_device<Ty>(scratchpad_size, exec_queue);
+  std::int64_t *ipiv_int64 =
+      sycl::malloc_device<std::int64_t>(batch_size * n, exec_queue);
+  std::int64_t **ipiv_int64_ptr =
+      sycl::malloc_shared<std::int64_t *>(batch_size, exec_queue);
+
+  exec_queue.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for<dpct_kernel_name<class getri_device_int64_to_int, T>>(
+        sycl::range<1>(batch_size * n), [=](sycl::id<1> idx) {
+          ipiv_int64[idx] = static_cast<std::int64_t>(ipiv[idx]);
+        });
+  });
+
+  T **a_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
+  T **b_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
+  exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *));
+  exec_queue.memcpy(b_shared, b, batch_size * sizeof(T *)).wait();
+  for (std::int64_t i = 0; i < batch_size; ++i) {
+    ipiv_int64_ptr[i] = ipiv_int64 + n * i;
+    // Need to create a copy of input matrices "a" to keep them unchanged.
+    // Matrices "b" (copy of matrices "a") will be used as input and output
+    // parameter in oneapi::mkl::lapack::getri_batch call.
+    matrix_mem_copy(b_shared[i], a_shared[i], ldb, lda, n, n, dpct::device_to_device,
+                    exec_queue);
+  }
+
+  sycl::event e = oneapi::mkl::lapack::getri_batch(
+      exec_queue, &n_int64, (Ty **)b_shared, &ldb_int64, ipiv_int64_ptr, 1,
+      &group_sizes, scratchpad, scratchpad_size);
+
+  std::vector<void *> ptrs{scratchpad, ipiv_int64_ptr, ipiv_int64, a_shared, b_shared};
+  async_dpct_free(ptrs, {e}, exec_queue);
+#endif
+}
+
+/// Computes the QR factorizations of a batch of general matrices.
+/// \param [in] exec_queue The queue where the routine should be executed.
+/// \param [in] m The number of rows in the matrices.
+/// \param [in] n The number of columns in the matrices.
+/// \param [in, out] a Array of pointers to matrices. These
+/// matrices will be overwritten by the factorization data.
+/// \param [in] lda The leading dimension of the matrices in \p a.
+/// \param [out] tau An array stores the scalars.
+/// \param [out] info A value stores the error information.
+/// \param [in] batch_size The size of the batch.
+template <typename T>
+inline void geqrf_batch_wrapper(sycl::queue exec_queue, int m, int n,
+                                T *a[], int lda, T *tau[], int *info,
+                                int batch_size) {
+  using Ty = typename DataType<T>::T2;
+  // Set the info value to 0
+  *info = 0;
+#ifdef DPCT_USM_LEVEL_NONE
+  std::int64_t stride_a = n * lda;
+  std::int64_t stride_tau = std::max(1, std::min(m, n));
+  std::int64_t scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<Ty>(
+      exec_queue, m, n, lda, stride_a, stride_tau, batch_size);
+
+  T *a_buffer_ptr, *tau_buffer_ptr;
+  a_buffer_ptr = (T *)dpct_malloc(stride_a * batch_size * sizeof(T));
+  tau_buffer_ptr = (T *)dpct_malloc(stride_tau * batch_size * sizeof(T));
+
+  T **host_a = (T **)std::malloc(batch_size * sizeof(T *));
+  T **host_tau = (T **)std::malloc(batch_size * sizeof(T *));
+  dpct_memcpy(host_a, a, batch_size * sizeof(T *));
+  dpct_memcpy(host_tau, tau, batch_size * sizeof(T *));
+
+  for (std::int64_t i = 0; i < batch_size; ++i)
+    dpct_memcpy(a_buffer_ptr + i * stride_a, host_a[i], n * lda * sizeof(T));
+  {
+    auto a_buffer = get_buffer<Ty>(a_buffer_ptr);
+    auto tau_buffer = get_buffer<Ty>(tau_buffer_ptr);
+    sycl::buffer<Ty, 1> scratchpad{sycl::range<1>(scratchpad_size)};
+    oneapi::mkl::lapack::geqrf_batch(exec_queue, m, n, a_buffer, lda, stride_a,
+                             tau_buffer, stride_tau, batch_size, scratchpad,
+                             scratchpad_size);
+  }
+
+  // Copy back to the original buffers
+  std::vector<sycl::event> events_a;
+  std::vector<sycl::event> events_tau;
+  for (std::int64_t i = 0; i < batch_size; ++i) {
+    events_a.push_back(detail::dpct_memcpy(exec_queue, host_a[i],
+                                           a_buffer_ptr + i * stride_a,
+                                           n * lda * sizeof(T), automatic));
+    events_tau.push_back(detail::dpct_memcpy(
+        exec_queue, host_tau[i], tau_buffer_ptr + i * stride_tau,
+        std::max(1, std::min(m, n)) * sizeof(T), automatic));
+  }
+  std::vector<void *> ptr_a{host_a};
+  std::vector<void *> ptr_tau{host_tau};
+  std::thread mem_free_thread_a(
+      [=](std::vector<void *> pointers_array,
+          std::vector<sycl::event> events_array) {
+        sycl::event::wait(events_array);
+        for (auto p : pointers_array)
+          std::free(p);
+      },
+      ptr_a, events_a);
+  std::thread mem_free_thread_tau(
+      [=](std::vector<void *> pointers_array,
+          std::vector<sycl::event> events_array) {
+        sycl::event::wait(events_array);
+        for (auto p : pointers_array)
+          std::free(p);
+      },
+      ptr_tau, events_tau);
+  mem_free_thread_a.detach();
+  mem_free_thread_tau.detach();
+#else
+  std::int64_t m_int64 = n;
+  std::int64_t n_int64 = n;
+  std::int64_t lda_int64 = lda;
+  std::int64_t group_sizes = batch_size;
+  std::int64_t scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<Ty>(
+      exec_queue, &m_int64, &n_int64, &lda_int64, 1, &group_sizes);
+
+  Ty *scratchpad = sycl::malloc_device<Ty>(scratchpad_size, exec_queue);
+  T **a_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
+  T **tau_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
+  exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *));
+  exec_queue.memcpy(tau_shared, tau, batch_size * sizeof(T *)).wait();
+
+  sycl::event e = oneapi::mkl::lapack::geqrf_batch(
+      exec_queue, &m_int64, &n_int64, (Ty **)a_shared, &lda_int64, (Ty **)tau_shared, 1,
+      &group_sizes, scratchpad, scratchpad_size);
+
+  std::vector<void *> ptrs{scratchpad, a_shared, tau_shared};
+  async_dpct_free(ptrs, {e}, exec_queue);
+#endif
+}
+
+/// Computes the Euclidean norm of a vector.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] n Number of elements in vector x.
+/// \param [in] x Input vector x.
+/// \param [in] x_type Data type of the vector x.
+/// \param [in] incx Stride of vector x.
+/// \param [out] result The result scalar.
+/// \param [in] result_type Data type of the result.
+inline void nrm2(sycl::queue &q, int n, const void *x, library_data_t x_type,
+                    int incx, void *result, library_data_t result_type) {
+  std::uint64_t key = detail::get_type_combination_id(x_type, result_type);
+  switch (key) {
+  case detail::get_type_combination_id(library_data_t::real_float,
+                       library_data_t::real_float): {
+    detail::nrm2_impl<float, float>(q, n, x, incx, result);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_double,
+                       library_data_t::real_double): {
+    detail::nrm2_impl<double, double>(q, n, x, incx, result);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_float,
+                       library_data_t::real_float): {
+    detail::nrm2_impl<std::complex<float>, float>(
+        q, n, x, incx, result);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_double,
+                       library_data_t::real_double): {
+    detail::nrm2_impl<std::complex<double>, double>(
+        q, n, x, incx, result);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_half,
+                       library_data_t::real_half): {
+    detail::nrm2_impl<sycl::half, sycl::half>(
+        q, n, x, incx, result);
+    break;
+  }
+  default:
+    throw std::runtime_error("the combination of data type is unsupported");
+  }
+}
+
+/// Computes the dot product of two vectors.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] n Number of elements in vector x.
+/// \param [in] x Input vector x.
+/// \param [in] x_type Data type of the vector x.
+/// \param [in] incx Stride of vector x.
+/// \param [in] y Input vector y.
+/// \param [in] y_type Data type of the vector y.
+/// \param [in] incy Stride of vector y.
+/// \param [out] result The result scalar.
+/// \param [in] result_type Data type of the result.
+inline void dot(sycl::queue &q, int n, const void *x, library_data_t x_type,
+                   int incx, const void *y, library_data_t y_type, int incy,
+                   void *result, library_data_t result_type) {
+  detail::dotuc<false>(q, n, x, x_type, incx, y, y_type, incy, result,
+                          result_type);
+}
+
+/// Computes the dot product of two vectors, conjugating the first vector.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] n Number of elements in vector x.
+/// \param [in] x Input vector x.
+/// \param [in] x_type Data type of the vector x.
+/// \param [in] incx Stride of vector x.
+/// \param [in] y Input vector y.
+/// \param [in] y_type Data type of the vector y.
+/// \param [in] incy Stride of vector y.
+/// \param [out] result The result scalar.
+/// \param [in] result_type Data type of the result.
+inline void dotc(sycl::queue &q, int n, const void *x, library_data_t x_type,
+                    int incx, const void *y, library_data_t y_type, int incy,
+                    void *result, library_data_t result_type) {
+  detail::dotuc<true>(q, n, x, x_type, incx, y, y_type, incy, result,
+                         result_type);
+}
+
+/// Computes the product of a vector by a scalar.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] n Number of elements in vector x.
+/// \param [in] alpha The scale factor alpha.
+/// \param [in] alpha_type The data type of alpha.
+/// \param [in, out] x Input/Output vector x.
+/// \param [in] x_type Data type of the vector x.
+/// \param [in] incx Stride of vector x.
+inline void scal(sycl::queue &q, int n, const void *alpha,
+                    library_data_t alpha_type, void *x, library_data_t x_type,
+                    int incx) {
+  std::uint64_t key = detail::get_type_combination_id(x_type);
+  switch (key) {
+  case detail::get_type_combination_id(library_data_t::real_float): {
+    detail::scal_impl<float, float>(q, n, alpha, x, incx);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_double): {
+    detail::scal_impl<double, double>(q, n, alpha, x, incx);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_float): {
+    detail::scal_impl<std::complex<float>, std::complex<float>>(q, n, alpha,
+                                                                   x, incx);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_double): {
+    detail::scal_impl<std::complex<double>, std::complex<double>>(
+        q, n, alpha, x, incx);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_half): {
+    float alpha_value =
+        dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+    sycl::half alaph_half(alpha_value);
+    detail::scal_impl<sycl::half, sycl::half>(q, n, &alaph_half, x, incx);
+    break;
+  }
+  default:
+    throw std::runtime_error("the combination of data type is unsupported");
+  }
+}
+
+/// Computes a vector-scalar product and adds the result to a vector.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] n Number of elements in vector x.
+/// \param [in] alpha The scale factor alpha.
+/// \param [in] alpha_type The data type of alpha.
+/// \param [in] x Input vector x.
+/// \param [in] x_type Data type of the vector x.
+/// \param [in] incx Stride of vector x.
+/// \param [in, out] y Input/Output vector y.
+/// \param [in] y_type Data type of the vector y.
+/// \param [in] incy Stride of vector y.
+inline void axpy(sycl::queue &q, int n, const void *alpha,
+                    library_data_t alpha_type, const void *x, library_data_t x_type,
+                    int incx, void *y, library_data_t y_type, int incy) {
+  std::uint64_t key = detail::get_type_combination_id(x_type, alpha_type);
+  switch (key) {
+  case detail::get_type_combination_id(library_data_t::real_float,
+                       library_data_t::real_float): {
+    detail::axpy_impl<float, float>(q, n, alpha, x, incx, y, incy);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_double,
+                       library_data_t::real_double): {
+    detail::axpy_impl<double, double>(q, n, alpha, x, incx, y, incy);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_float,
+                       library_data_t::complex_float): {
+    detail::axpy_impl<std::complex<float>, std::complex<float>>(
+        q, n, alpha, x, incx, y, incy);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_double,
+                       library_data_t::complex_double): {
+    detail::axpy_impl<std::complex<double>, std::complex<double>>(
+        q, n, alpha, x, incx, y, incy);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_half,
+                       library_data_t::real_float): {
+    float alpha_value =
+        dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+    sycl::half alaph_half(alpha_value);
+    detail::axpy_impl<sycl::half, sycl::half>(q, n, &alaph_half, x, incx, y, incy);
+    break;
+  }
+  default:
+    throw std::runtime_error("the combination of data type is unsupported");
+  }
+}
+
+/// Performs rotation of points in the plane.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] n Number of elements in vector x.
+/// \param [in, out] x Input/Output vector x.
+/// \param [in] x_type Data type of the vector x.
+/// \param [in] incx Stride of vector x.
+/// \param [in, out] y Input/Output vector y.
+/// \param [in] y_type Data type of the vector y.
+/// \param [in] incy Stride of vector y.
+/// \param [in] c Scaling factor.
+/// \param [in] s Scaling factor.
+/// \param [in] cs_type Data type of the scaling factors.
+inline void rot(sycl::queue &q, int n, void *x, library_data_t x_type,
+                   int incx, void *y, library_data_t y_type, int incy,
+                   const void *c, const void *s, library_data_t cs_type) {
+  std::uint64_t key = detail::get_type_combination_id(x_type, cs_type);
+  switch (key) {
+  case detail::get_type_combination_id(library_data_t::real_float,
+                       library_data_t::real_float): {
+    detail::rot_impl<float, float, float>(q, n, x, incx, y, incy, c, s);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_double,
+                       library_data_t::real_double): {
+    detail::rot_impl<double, double, double>(q, n, x, incx, y, incy, c, s);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_float,
+                       library_data_t::real_float): {
+    detail::rot_impl<std::complex<float>, float, float>(q, n, x, incx, y, incy, c,
+                                                    s);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_double,
+                       library_data_t::real_double): {
+    detail::rot_impl<std::complex<double>, double, double>(q, n, x, incx, y, incy, c,
+                                                      s);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_float,
+                       library_data_t::complex_float): {
+    detail::rot_impl<std::complex<float>, float, std::complex<float>>(q, n, x, incx, y, incy, c, s);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_double,
+                       library_data_t::complex_double): {
+    detail::rot_impl<std::complex<double>, double, std::complex<double>>(q, n, x, incx, y, incy, c, s);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_half,
+                       library_data_t::real_half): {
+    detail::rot_impl<sycl::half, sycl::half, sycl::half>(q, n, x, incx, y, incy, c, s);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_bfloat16,
+                       library_data_t::real_bfloat16): {
+    detail::rot_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16>(q, n, x, incx, y, incy, c, s);
+    break;
+  }
+  default:
+    throw std::runtime_error("the combination of data type is unsupported");
+  }
+}
+
+/// Computes matrix-matrix product with general matrices.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] a_trans Specifies the operation applied to A.
+/// \param [in] b_trans Specifies the operation applied to B.
+/// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+/// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+/// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+/// \param [in] alpha Scaling factor for the matrix-matrix product.
+/// \param [in] a Input matrix A.
+/// \param [in] a_type Data type of the matrix A.
+/// \param [in] lda Leading dimension of A.
+/// \param [in] b Input matrix B.
+/// \param [in] b_type Data type of the matrix B.
+/// \param [in] ldb Leading dimension of B.
+/// \param [in] beta Scaling factor for matrix C.
+/// \param [in, out] c Input/Output matrix C.
+/// \param [in] c_type Data type of the matrix C.
+/// \param [in] ldc Leading dimension of C.
+/// \param [in] scaling_type Data type of the scaling factors.
+inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                 oneapi::mkl::transpose b_trans, int m, int n, int k,
+                 const void *alpha, const void *a, library_data_t a_type,
+                 int lda, const void *b, library_data_t b_type, int ldb,
+                 const void *beta, void *c, library_data_t c_type, int ldc,
+                 library_data_t scaling_type) {
+  bool matched = false;
+  if (scaling_type == library_data_t::real_float &&
+      c_type == library_data_t::complex_float) {
+    scaling_type = library_data_t::complex_float;
+  } else if (scaling_type == library_data_t::real_double &&
+             c_type == library_data_t::complex_double) {
+    scaling_type = library_data_t::complex_double;
+  }
+
+  std::uint64_t key =
+      detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+  switch (key) {
+  case detail::get_type_combination_id(
+      library_data_t::real_float, library_data_t::real_float,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_impl<float, float, float, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_double, library_data_t::real_double,
+      library_data_t::real_double, library_data_t::real_double): {
+    detail::gemm_impl<double, double, double, double>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::complex_float, library_data_t::complex_float,
+      library_data_t::complex_float, library_data_t::complex_float): {
+    detail::gemm_impl<std::complex<float>, std::complex<float>,
+                      std::complex<float>, std::complex<float>>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::complex_double, library_data_t::complex_double,
+      library_data_t::complex_double, library_data_t::complex_double): {
+    detail::gemm_impl<std::complex<double>, std::complex<double>,
+                      std::complex<double>, std::complex<double>>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_half, library_data_t::real_half,
+      library_data_t::real_half, library_data_t::real_half): {
+    detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                      sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
+                                      lda, b, ldb, beta, c, ldc);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                      float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
+                             ldb, beta, c, ldc);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_half, library_data_t::real_half,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_impl<sycl::half, sycl::half, float, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_half, library_data_t::real_half,
+      library_data_t::real_half, library_data_t::real_float): {
+    float alpha_value =
+        dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+    float beta_value =
+        dpct::get_value(reinterpret_cast<const float *>(beta), q);
+    sycl::half alpha_half(alpha_value);
+    sycl::half beta_half(beta_value);
+    detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                      sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
+                                      a, lda, b, ldb, &beta_half, c, ldc);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_int8, library_data_t::real_int8,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+      library_data_t::real_bfloat16, library_data_t::real_float): {
+    detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                      oneapi::mkl::bfloat16, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_int8, library_data_t::real_int8,
+      library_data_t::real_int32, library_data_t::real_int32): {
+    float alpha_float =
+        dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+    float beta_float =
+        dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+    detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
+        q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
+    break;
+  }
+  default:
+    throw std::runtime_error("the combination of data type is unsupported");
+  }
+}
+
+/// Computes a batch of matrix-matrix product with general matrices.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] a_trans Specifies the operation applied to A.
+/// \param [in] b_trans Specifies the operation applied to B.
+/// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+/// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+/// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+/// \param [in] alpha Scaling factor for the matrix-matrix product.
+/// \param [in] a Input matrix A.
+/// \param [in] a_type Data type of the matrix A.
+/// \param [in] lda Leading dimension of A.
+/// \param [in] b Input matrix B.
+/// \param [in] b_type Data type of the matrix B.
+/// \param [in] ldb Leading dimension of B.
+/// \param [in] beta Scaling factor for matrix C.
+/// \param [in, out] c Input/Output matrix C.
+/// \param [in] c_type Data type of the matrix C.
+/// \param [in] ldc Leading dimension of C.
+/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+/// \param [in] scaling_type Data type of the scaling factors.
+inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                       oneapi::mkl::transpose b_trans, int m, int n, int k,
+                       const void *alpha, const void *a[],
+                       library_data_t a_type, int lda, const void *b[],
+                       library_data_t b_type, int ldb, const void *beta,
+                       void *c[], library_data_t c_type, int ldc,
+                       int batch_size, library_data_t scaling_type) {
+#ifdef DPCT_USM_LEVEL_NONE
+  throw std::runtime_error("this API is unsupported when USM level is none");
+#else
+  bool matched = false;
+  if (scaling_type == library_data_t::real_float &&
+      c_type == library_data_t::complex_float) {
+    scaling_type = library_data_t::complex_float;
+  } else if (scaling_type == library_data_t::real_double &&
+             c_type == library_data_t::complex_double) {
+    scaling_type = library_data_t::complex_double;
+  }
+
+  std::uint64_t key =
+      detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+  switch (key) {
+  case detail::get_type_combination_id(
+      library_data_t::real_float, library_data_t::real_float,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_batch_impl<float, float, float, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+        batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_double, library_data_t::real_double,
+      library_data_t::real_double, library_data_t::real_double): {
+    detail::gemm_batch_impl<double, double, double, double>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+        batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::complex_float, library_data_t::complex_float,
+      library_data_t::complex_float, library_data_t::complex_float): {
+    detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                            std::complex<float>, std::complex<float>>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+        batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::complex_double, library_data_t::complex_double,
+      library_data_t::complex_double, library_data_t::complex_double): {
+    detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                            std::complex<double>, std::complex<double>>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+        batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_half, library_data_t::real_half,
+      library_data_t::real_half, library_data_t::real_half): {
+    detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                            sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                                            a, lda, b, ldb, beta, c, ldc,
+                                            batch_size);
+    break;
+  }
+#ifdef __INTEL_MKL__
+  case detail::get_type_combination_id(
+      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+      library_data_t::real_bfloat16, library_data_t::real_float): {
+    detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                            oneapi::mkl::bfloat16, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+        batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                            float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                                   b, ldb, beta, c, ldc, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_int8, library_data_t::real_int8,
+      library_data_t::real_int32, library_data_t::real_int32): {
+    float alpha_float =
+        dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+    float beta_float =
+        dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+    detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                            float>(q, a_trans, b_trans, m, n, k, &alpha_float,
+                                          a, lda, b, ldb, &beta_float, c, ldc,
+                                          batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_int8, library_data_t::real_int8,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+        batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_half, library_data_t::real_half,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+        batch_size);
+    break;
+  }
+#endif
+  case detail::get_type_combination_id(
+      library_data_t::real_half, library_data_t::real_half,
+      library_data_t::real_half, library_data_t::real_float): {
+    float alpha_value =
+        dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+    float beta_value =
+        dpct::get_value(reinterpret_cast<const float *>(beta), q);
+    sycl::half alpha_half(alpha_value);
+    sycl::half beta_half(beta_value);
+    detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+        q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
+        batch_size);
+    break;
+  }
+  default:
+    throw std::runtime_error("the combination of data type is unsupported");
+  }
+#endif
+}
+
+/// Computes a batch of matrix-matrix product with general matrices.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] a_trans Specifies the operation applied to A.
+/// \param [in] b_trans Specifies the operation applied to B.
+/// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+/// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+/// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+/// \param [in] alpha Scaling factor for the matrix-matrix product.
+/// \param [in] a Input matrix A.
+/// \param [in] a_type Data type of the matrix A.
+/// \param [in] lda Leading dimension of A.
+/// \param [in] stride_a Stride between the different A matrices.
+/// \param [in] b Input matrix B.
+/// \param [in] b_type Data type of the matrix B.
+/// \param [in] ldb Leading dimension of B.
+/// \param [in] stride_b Stride between the different B matrices.
+/// \param [in] beta Scaling factor for matrix C.
+/// \param [in, out] c Input/Output matrix C.
+/// \param [in] c_type Data type of the matrix C.
+/// \param [in] ldc Leading dimension of C.
+/// \param [in] stride_c Stride between the different C matrices.
+/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+/// \param [in] scaling_type Data type of the scaling factors.
+inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                       oneapi::mkl::transpose b_trans, int m, int n, int k,
+                       const void *alpha, const void *a, library_data_t a_type,
+                       int lda, long long int stride_a, const void *b,
+                       library_data_t b_type, int ldb, long long int stride_b,
+                       const void *beta, void *c, library_data_t c_type,
+                       int ldc, long long int stride_c, int batch_size,
+                       library_data_t scaling_type) {
+  bool matched = false;
+  if (scaling_type == library_data_t::real_float &&
+      c_type == library_data_t::complex_float) {
+    scaling_type = library_data_t::complex_float;
+  } else if (scaling_type == library_data_t::real_double &&
+             c_type == library_data_t::complex_double) {
+    scaling_type = library_data_t::complex_double;
+  }
+
+  std::uint64_t key =
+      detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+  switch (key) {
+  case detail::get_type_combination_id(
+      library_data_t::real_float, library_data_t::real_float,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_batch_impl<float, float, float, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+        beta, c, ldc, stride_c, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_double, library_data_t::real_double,
+      library_data_t::real_double, library_data_t::real_double): {
+    detail::gemm_batch_impl<double, double, double, double>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+        beta, c, ldc, stride_c, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::complex_float, library_data_t::complex_float,
+      library_data_t::complex_float, library_data_t::complex_float): {
+    detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                            std::complex<float>, std::complex<float>>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+        beta, c, ldc, stride_c, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::complex_double, library_data_t::complex_double,
+      library_data_t::complex_double, library_data_t::complex_double): {
+    detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                            std::complex<double>, std::complex<double>>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+        beta, c, ldc, stride_c, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_half, library_data_t::real_half,
+      library_data_t::real_half, library_data_t::real_half): {
+    detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                            sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                                            a, lda, stride_a, b, ldb, stride_b,
+                                            beta, c, ldc, stride_c, batch_size);
+    break;
+  }
+#ifdef __INTEL_MKL__
+  case detail::get_type_combination_id(
+      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+      library_data_t::real_bfloat16, library_data_t::real_float): {
+    detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                            oneapi::mkl::bfloat16, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+        beta, c, ldc, stride_c, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                            float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                                   stride_a, b, ldb, stride_b, beta, c, ldc,
+                                   stride_c, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_int8, library_data_t::real_int8,
+      library_data_t::real_int32, library_data_t::real_int32): {
+    detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                            std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
+                                          a, lda, stride_a, b, ldb, stride_b,
+                                          beta, c, ldc, stride_c, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_int8, library_data_t::real_int8,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+        beta, c, ldc, stride_c, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(
+      library_data_t::real_half, library_data_t::real_half,
+      library_data_t::real_float, library_data_t::real_float): {
+    detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+        beta, c, ldc, stride_c, batch_size);
+    break;
+  }
+#endif
+  case detail::get_type_combination_id(
+      library_data_t::real_half, library_data_t::real_half,
+      library_data_t::real_half, library_data_t::real_float): {
+    float alpha_value =
+        dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+    float beta_value =
+        dpct::get_value(reinterpret_cast<const float *>(beta), q);
+    sycl::half alpha_half(alpha_value);
+    sycl::half beta_half(beta_value);
+    detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+        q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
+        &beta_half, c, ldc, stride_c, batch_size);
+    break;
+  }
+  default:
+    throw std::runtime_error("the combination of data type is unsupported");
+  }
+}
+
+/// This routines perform a special rank-k update of a symmetric matrix C by
+/// general matrices A and B.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] uplo Specifies whether C's data is stored in its upper or lower triangle.
+/// \param [in] trans Specifies the operation to apply.
+/// \param [in] n The number of rows and columns in C.
+/// \param [in] k The inner dimension of matrix multiplications.
+/// \param [in] alpha Scaling factor for the rank-k update.
+/// \param [in] a Input matrix A.
+/// \param [in] lda Leading dimension of A.
+/// \param [in] b Input matrix B.
+/// \param [in] ldb Leading dimension of B.
+/// \param [in] beta Scaling factor for the rank-k update.
+/// \param [in, out] c Input/Output matrix C.
+/// \param [in] ldc Leading dimension of C.
+template <class T>
+inline void syrk(sycl::queue &q, oneapi::mkl::uplo uplo,
+                  oneapi::mkl::transpose trans, int n, int k, const T *alpha,
+                  const T *a, int lda, const T *b, int ldb, const T *beta, T *c,
+                  int ldc) {
+  detail::rk_impl<false, T, T>(q, uplo, trans, n, k, alpha, a, lda, b,
+                                     ldb, beta, c, ldc);
+}
+
+/// This routines perform a special rank-k update of a Hermitian matrix C by
+/// general matrices A and B.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] uplo Specifies whether C's data is stored in its upper or lower triangle.
+/// \param [in] trans Specifies the operation to apply.
+/// \param [in] n The number of rows and columns in C.
+/// \param [in] k The inner dimension of matrix multiplications.
+/// \param [in] alpha Scaling factor for the rank-k update.
+/// \param [in] a Input matrix A.
+/// \param [in] lda Leading dimension of A.
+/// \param [in] b Input matrix B.
+/// \param [in] ldb Leading dimension of B.
+/// \param [in] beta Scaling factor for the rank-k update.
+/// \param [in, out] c Input/Output matrix C.
+/// \param [in] ldc Leading dimension of C.
+template <class T, class Tbeta>
+inline void herk(sycl::queue &q, oneapi::mkl::uplo uplo,
+                 oneapi::mkl::transpose trans, int n, int k, const T *alpha,
+                 const T *a, int lda, const T *b, int ldb, const Tbeta *beta,
+                 T *c, int ldc) {
+  detail::rk_impl<true, T, Tbeta>(q, uplo, trans, n, k, alpha, a, lda, b,
+                                        ldb, beta, c, ldc);
+}
+
+/// This routine performs a group of trsm operations. Each trsm solves an
+/// equation of the form op(A) * X = alpha * B or X * op(A) = alpha * B.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] left_right Specifies A multiplies X on the left or on the right.
+/// \param [in] upper_lower Specifies A is upper or lower triangular.
+/// \param [in] trans Specifies the operation applied to A.
+/// \param [in] unit_diag Specifies whether A is unit triangular.
+/// \param [in] m Number of rows of the B matrices.
+/// \param [in] n Number of columns of the B matrices.
+/// \param [in] alpha Scaling factor for the solutions.
+/// \param [in] a Input matrices A.
+/// \param [in] a_type Data type of the matrices A.
+/// \param [in] lda Leading dimension of the matrices A.
+/// \param [in, out] b Input and output matrices B.
+/// \param [in] b_type Data type of the matrices B.
+/// \param [in] ldb Leading dimension of the matrices B.
+/// \param [in] batch_size Specifies the number of trsm operations to perform.
+/// \param [in] scaling_type Data type of the scaling factors.
+inline void trsm_batch(sycl::queue &q, oneapi::mkl::side left_right,
+                       oneapi::mkl::uplo upper_lower,
+                       oneapi::mkl::transpose trans,
+                       oneapi::mkl::diag unit_diag, int m, int n,
+                       const void *alpha, const void **a, library_data_t a_type,
+                       int lda, void **b, library_data_t b_type, int ldb,
+                       int batch_size, library_data_t scaling_type) {
+#ifdef DPCT_USM_LEVEL_NONE
+  throw std::runtime_error("this API is unsupported when USM level is none");
+#else
+  std::uint64_t key =
+      detail::get_type_combination_id(a_type, b_type, scaling_type);
+  switch (key) {
+  case detail::get_type_combination_id(library_data_t::real_float,
+                                       library_data_t::real_float,
+                                       library_data_t::real_float): {
+    detail::trsm_batch_impl<float, float, float>(q, left_right, upper_lower,
+                                                 trans, unit_diag, m, n, alpha,
+                                                 a, lda, b, ldb, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::real_double,
+                                       library_data_t::real_double,
+                                       library_data_t::real_double): {
+    detail::trsm_batch_impl<double, double, double>(
+        q, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
+        ldb, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_float,
+                                       library_data_t::complex_float,
+                                       library_data_t::complex_float): {
+    detail::trsm_batch_impl<std::complex<float>, std::complex<float>,
+                            std::complex<float>>(q, left_right, upper_lower,
+                                                 trans, unit_diag, m, n, alpha,
+                                                 a, lda, b, ldb, batch_size);
+    break;
+  }
+  case detail::get_type_combination_id(library_data_t::complex_double,
+                                       library_data_t::complex_double,
+                                       library_data_t::complex_double): {
+    detail::trsm_batch_impl<std::complex<double>, std::complex<double>,
+                            std::complex<double>>(q, left_right, upper_lower,
+                                                  trans, unit_diag, m, n, alpha,
+                                                  a, lda, b, ldb, batch_size);
+    break;
+  }
+  default:
+    throw std::runtime_error("the combination of data type is unsupported");
+  }
+#endif
+}
+
+/// Computes a triangular matrix-general matrix product.
+/// \param [in] q The queue where the routine should be executed.
+/// \param [in] left_right Specifies A is on the left or right side of the
+/// multiplication.
+/// \param [in] upper_lower Specifies A is upper or lower triangular.
+/// \param [in] trans Specifies the operation applied to A.
+/// \param [in] unit_diag Specifies whether A is unit triangular.
+/// \param [in] m Number of rows of B.
+/// \param [in] n Number of columns of B.
+/// \param [in] alpha Scaling factor for the matrix-matrix product.
+/// \param [in] a Input matrices A.
+/// \param [in] lda Leading dimension of the matrices A.
+/// \param [in] b Input matrices B.
+/// \param [in] ldb Leading dimension of the matrices B.
+/// \param [out] c Output matrices C.
+/// \param [in] ldc Leading dimension of the matrices C.
+template <class T>
+inline void trmm(sycl::queue &q, oneapi::mkl::side left_right,
+                 oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
+                 oneapi::mkl::diag unit_diag, int m, int n, const T *alpha,
+                 const T *a, int lda, const T *b, int ldb, T *c, int ldc) {
+  using Ty = typename DataType<T>::T2;
+  auto alpha_val = dpct::get_value(alpha, q);
+  if (b != c) {
+    dpct::matrix_mem_copy(c, b, ldc, ldb, m, n, dpct::device_to_device, q);
+  }
+  auto data_a = detail::get_memory<const Ty>(a);
+  auto data_c = detail::get_memory<Ty>(c);
+  oneapi::mkl::blas::column_major::trmm(q, left_right, upper_lower, trans,
+                                        unit_diag, m, n, alpha_val, data_a, lda,
+                                        data_c, ldc);
+}
+
+} // namespace dpct
+#endif // __DPCT_BLAS_UTILS_HPP__
diff --git a/dpct/ccl_utils.hpp b/dpct/ccl_utils.hpp
new file mode 100644
index 0000000000000..07b3488c937da
--- /dev/null
+++ b/dpct/ccl_utils.hpp
@@ -0,0 +1,286 @@
+//==---- ccl_utils.hpp----------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_CCL_UTILS_HPP__
+#define __DPCT_CCL_UTILS_HPP__
+
+#include <sycl/sycl.hpp>
+#include <oneapi/ccl.hpp>
+#include <unordered_map>
+#include <memory>
+
+#include "device.hpp"
+
+namespace dpct {
+namespace ccl {
+namespace detail {
+
+/// Get stored kvs with specified kvs address.
+inline std::shared_ptr<oneapi::ccl::kvs> &
+get_kvs(const oneapi::ccl::kvs::address_type &addr) {
+  struct hash {
+    std::size_t operator()(const oneapi::ccl::kvs::address_type &in) const {
+      return std::hash<std::string_view>()(std::string_view(in.data(), in.size()));
+    }
+  };
+  static std::unordered_map<oneapi::ccl::kvs::address_type,
+                            std::shared_ptr<oneapi::ccl::kvs>, hash>
+      kvs_map;
+  return kvs_map[addr];
+}
+
+/// Help class to init ccl environment. 
+class ccl_init_helper {
+public:
+  ccl_init_helper() { oneapi::ccl::init(); }
+};
+
+} // namespace detail
+
+/// Get concatenated library version as an integer.
+static inline int get_version() {
+  oneapi::ccl::init();
+  auto ver = oneapi::ccl::get_library_version();
+  return ver.major * 10000 + ver.minor * 100 + ver.update;
+}
+
+/// Create main kvs and return its address.
+static inline oneapi::ccl::kvs::address_type create_kvs_address() {
+  oneapi::ccl::init();
+  auto ptr = oneapi::ccl::create_main_kvs();
+  auto addr = ptr->get_address();
+  detail::get_kvs(addr) = ptr;
+  return addr;
+}
+
+/// Get stored kvs with /p addr if exist. Otherwise, create kvs with /p addr.
+static inline std::shared_ptr<oneapi::ccl::kvs>
+create_kvs(const oneapi::ccl::kvs::address_type &addr) {
+  oneapi::ccl::init();
+  auto &ptr = detail::get_kvs(addr);
+  if (!ptr)
+    ptr = oneapi::ccl::create_kvs(addr);
+  return ptr;
+}
+
+/// dpct communicator extension
+class communicator_wrapper : public dpct::ccl::detail::ccl_init_helper {
+public:
+  communicator_wrapper(
+      int size, int rank, oneapi::ccl::kvs::address_type id,
+      const oneapi::ccl::comm_attr &attr = oneapi::ccl::default_comm_attr)
+      : _device_comm(oneapi::ccl::create_device(
+            static_cast<sycl::device &>(dpct::get_current_device()))),
+        _context_comm(oneapi::ccl::create_context(dpct::get_default_context())),
+        _comm(oneapi::ccl::create_communicator(
+            size, rank, _device_comm, _context_comm, dpct::ccl::create_kvs(id),
+            attr)) {
+    _queue_init = false;
+    _ccl_stream_ptr = nullptr;
+  }
+
+  ~communicator_wrapper() {
+    delete _ccl_stream_ptr;
+  };
+
+  /// Return the rank in a oneapi::ccl::communicator
+  /// \returns The rank corresponding to communicator object
+  int rank() const {
+    return _comm.rank();
+  }
+
+  /// Retrieves the number of rank in oneapi::ccl::communicator
+  /// \returns The number of the ranks
+  int size() const {
+    return _comm.size();
+  }
+
+  /// Return underlying native device, which was used in oneapi::ccl::communicator
+  sycl::device get_device() const {
+    return _comm.get_device().get_native();
+  }
+
+  /// \brief allreduce is a collective communication operation that performs the global reduction operation
+  ///       on values from all ranks of communicator and distributes the result back to all ranks.
+  /// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced
+  /// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff
+  /// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff
+  /// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
+  /// \param rtype the type of the reduction operation to be applied
+  /// \param queue_ptr a sycl::queue ptr associated with the operation
+  /// \return @ref void
+  void allreduce(const void *sendbuff, void *recvbuff, size_t count,
+                 oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
+                 sycl::queue *queue_ptr) {
+    call_func_wrapper(
+        [=](const oneapi::ccl::stream &stream) {
+          return oneapi::ccl::allreduce(sendbuff, recvbuff, count, dtype, rtype,
+                                        _comm, stream);
+        },
+        queue_ptr);
+  }
+
+  /// \brief reduce is a collective communication operation that performs the
+  ///        global reduction operation on values from all ranks of the communicator
+  ///        and returns the result to the root rank.
+  /// \param sendbuff the buffer with @c count elements of @c dtype that stores
+  ///        local data to be reduced 
+  /// \param recvbuff [out] the buffer to store reduced result, 
+  ///        must have the same dimension as @c sendbuff 
+  /// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff 
+  /// \param dtype the datatype of elements in @c sendbuff and @c recvbuff 
+  /// \param root the rank that gets the result of reduction 
+  /// \param rtype the type of the reduction operation to be applied 
+  /// \param queue_ptr a sycl::queue ptr associated with the operation 
+  /// \return @ref void
+  void reduce(const void *sendbuff, void *recvbuff, size_t count,
+              oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
+              int root, sycl::queue *queue_ptr) {
+    call_func_wrapper(
+        [=](const oneapi::ccl::stream &stream) {
+          return oneapi::ccl::reduce(sendbuff, recvbuff, count, dtype, rtype,
+                                     root, _comm, stream);
+        },
+        queue_ptr);
+  }
+
+  /// \brief broadcast is a collective communication operation that broadcasts data
+  ///        from one rank of communicator (denoted as root) to all other ranks.
+  ///        Only support in-place operation
+  /// \param sendbuff the buffer with @c count elements of @c dtype that stores
+  ///        local data to be reduced 
+  /// \param recvbuff [out] the buffer to store reduced result
+  /// \param count the number of elements of type @c dtype in @c buf 
+  /// \param dtype thedatatype of elements in @c buf 
+  /// \param root the rank that broadcasts @c buf
+  /// \param queue_ptr a sycl::queue ptr associated with the operation
+  /// \return @ref void
+  void broadcast(void *sendbuff, void *recvbuff, size_t count,
+                 oneapi::ccl::datatype dtype, int root,
+                 sycl::queue *queue_ptr) {
+    if (sendbuff != recvbuff) {
+      throw std::runtime_error(
+          "oneCCL broadcast only support in-place operation. "
+          "sendbuff and recvbuff must be same.");
+      return;
+    }
+    call_func_wrapper(
+        [=](const oneapi::ccl::stream &stream) {
+          return oneapi::ccl::broadcast(recvbuff, count, dtype, root, _comm,
+                                        stream);
+        },
+        queue_ptr);
+  }
+
+  /// \brief reduce_scatter is a collective communication operation that performs the global reduction operation
+  ///        on values from all ranks of the communicator and scatters the result in blocks back to all ranks.
+  /// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced
+  /// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff
+  /// \param recv_count the number of elements of type @c dtype in receive block
+  /// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
+  /// \param rtype the type of the reduction operation to be applied
+  /// \param queue_ptr a sycl::queue ptr associated with the operation
+  /// \return @ref void
+  void reduce_scatter(const void *sendbuff, void *recvbuff, size_t recv_count,
+                      oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
+                      sycl::queue *queue_ptr) {
+    call_func_wrapper(
+        [=](const oneapi::ccl::stream &stream) {
+          return oneapi::ccl::reduce_scatter(sendbuff, recvbuff, recv_count,
+                                             dtype, rtype, _comm, stream);
+        },
+        queue_ptr);
+  }
+
+  /// \brief send is a pt2pt communication operation that sends data from one rank of communicator.
+  /// \param sendbuff the buffer with @c count elements of @c dtype serves as send buffer for root
+  /// \param count the number of elements of type @c dtype in @c sendbuff
+  /// \param dtype the datatype of elements in @c sendbuff
+  /// \param peer the rank that receives @c sendbuff
+  /// \param queue_ptr a sycl::queue ptr associated with the operation
+  /// \return @ref void
+  void send(void *sendbuff, size_t count, oneapi::ccl::datatype dtype, int peer,
+            sycl::queue *queue_ptr) {
+    call_func_wrapper(
+        [=](const oneapi::ccl::stream &stream) {
+          return oneapi::ccl::send(sendbuff, count, dtype, peer, _comm, stream);
+        },
+        queue_ptr);
+  }
+
+  /// \brief recv is a pt2pt communication operation that sends data from one rank of communicator.
+  /// \param recvbuff the buffer with @c count elements of @c dtype serves as  receive buffer
+  /// \param count the number of elements of type @c dtype in @c recvbuff
+  /// \param dtype the datatype of elements in @c recvbuff
+  /// \param peer the rank that receives @c recvbuff
+  /// \param queue_ptr a sycl::queue ptr associated with the operation
+  /// \return @ref void
+  void recv(void *recvbuff, size_t count, oneapi::ccl::datatype dtype, int peer,
+            sycl::queue *queue_ptr) {
+    call_func_wrapper(
+        [=](const oneapi::ccl::stream &stream) {
+          return oneapi::ccl::recv(recvbuff, count, dtype, peer, _comm, stream);
+        },
+        queue_ptr);
+  }
+
+private:
+  oneapi::ccl::device _device_comm;
+  oneapi::ccl::context _context_comm;
+  oneapi::ccl::communicator _comm;
+  sycl::queue _queue;
+  bool _queue_init;
+  oneapi::ccl::stream *_ccl_stream_ptr;
+
+  template <class Fn>
+  void call_func_wrapper(Fn func, sycl::queue *qptr) {
+    if (_queue_init && *qptr != _queue) {
+      call_func_async(func, qptr);
+    } else {
+      if(!_queue_init) {
+        _queue = *qptr;
+        _queue_init = true;
+        _ccl_stream_ptr = new oneapi::ccl::stream(oneapi::ccl::create_stream(_queue));
+      }
+      std::invoke(func, *_ccl_stream_ptr);
+    }
+  }
+
+  class call_func_async {
+    sycl::queue *_q_ptr;
+    struct call_async_impl {
+      oneapi::ccl::stream _ccl_stream_impl;
+      oneapi::ccl::event _ccl_event_impl;
+      template <class Fn>
+      explicit call_async_impl(Fn func, sycl::queue *qptr)
+          : _ccl_stream_impl(oneapi::ccl::create_stream(*qptr)),
+            _ccl_event_impl(std::invoke(func, _ccl_stream_impl)) {}
+    };
+    call_async_impl *_imp;
+
+  public:
+    template <class Fn>
+    explicit call_func_async(Fn func, sycl::queue *qptr)
+        : _q_ptr(qptr),
+          _imp(new call_async_impl(func, qptr)) {}
+    ~call_func_async() {
+      _q_ptr->submit([&](sycl::handler &cgh)
+                     { cgh.host_task([=]
+                                     {
+        _imp->_ccl_event_impl.wait();
+        delete _imp; }); });
+    }
+  };
+};
+
+typedef dpct::ccl::communicator_wrapper *comm_ptr;
+
+} // namespace ccl
+} // namespace dpct
+
+#endif // __DPCT_CCL_UTILS_HPP__
\ No newline at end of file
diff --git a/dpct/device.hpp b/dpct/device.hpp
new file mode 100644
index 0000000000000..729ebf625a472
--- /dev/null
+++ b/dpct/device.hpp
@@ -0,0 +1,781 @@
+//==---- device.hpp -------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_DEVICE_HPP__
+#define __DPCT_DEVICE_HPP__
+
+#include <sycl/sycl.hpp>
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <iostream>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <map>
+#include <vector>
+#include <thread>
+#if defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#endif
+#if defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+namespace dpct {
+namespace detail {
+static void get_version(const sycl::device &dev, int &major, int &minor) {
+  // Version string has the following format:
+  // a. OpenCL<space><major.minor><space><vendor-specific-information>
+  // b. <major.minor>
+  std::string ver;
+  ver = dev.get_info<sycl::info::device::version>();
+  std::string::size_type i = 0;
+  while (i < ver.size()) {
+    if (isdigit(ver[i]))
+      break;
+    i++;
+  }
+  major = std::stoi(&(ver[i]));
+  while (i < ver.size()) {
+    if (ver[i] == '.')
+      break;
+    i++;
+  }
+  i++;
+  minor = std::stoi(&(ver[i]));
+}
+} // namespace detail
+
+/// SYCL default exception handler
+inline auto exception_handler = [](sycl::exception_list exceptions) {
+  for (std::exception_ptr const &e : exceptions) {
+    try {
+      std::rethrow_exception(e);
+    } catch (sycl::exception const &e) {
+      std::cerr << "Caught asynchronous SYCL exception:" << std::endl
+                << e.what() << std::endl
+                << "Exception caught at file:" << __FILE__
+                << ", line:" << __LINE__ << std::endl;
+    }
+  }
+};
+
+typedef sycl::event *event_ptr;
+
+typedef sycl::queue *queue_ptr;
+
+typedef char *device_ptr;
+
+/// Destroy \p event pointed memory.
+///
+/// \param event Pointer to the sycl::event address.
+static void destroy_event(event_ptr event) {
+    delete event;
+}
+
+class device_info {
+public:
+  // get interface
+  const char *get_name() const { return _name; }
+  char *get_name() { return _name; }
+  template <typename WorkItemSizesTy = sycl::range<3>,
+            std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                 std::is_same_v<WorkItemSizesTy, int *>,
+                             int> = 0>
+  auto get_max_work_item_sizes() const {
+    if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
+      return sycl::range<3>(_max_work_item_sizes_i[0],
+                            _max_work_item_sizes_i[1],
+                            _max_work_item_sizes_i[2]);
+    else {
+      return _max_work_item_sizes_i;
+    }  
+  }
+  template <typename WorkItemSizesTy = sycl::range<3>,
+            std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                 std::is_same_v<WorkItemSizesTy, int *>,
+                             int> = 0>
+  auto get_max_work_item_sizes() {
+    if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
+      return sycl::range<3>(_max_work_item_sizes_i[0],
+                            _max_work_item_sizes_i[1],
+                            _max_work_item_sizes_i[2]);
+    else {
+      return _max_work_item_sizes_i;
+    }  
+  }
+  bool get_host_unified_memory() const { return _host_unified_memory; }
+  int get_major_version() const { return _major; }
+  int get_minor_version() const { return _minor; }
+  int get_integrated() const { return _integrated; }
+  int get_max_clock_frequency() const { return _frequency; }
+  int get_max_compute_units() const { return _max_compute_units; }
+  int get_max_work_group_size() const { return _max_work_group_size; }
+  int get_max_sub_group_size() const { return _max_sub_group_size; }
+  int get_max_work_items_per_compute_unit() const {
+    return _max_work_items_per_compute_unit;
+  }
+  int get_max_register_size_per_work_group() const {
+    return _max_register_size_per_work_group;
+  }
+  template <typename NDRangeSizeTy = size_t *,
+            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                 std::is_same_v<NDRangeSizeTy, int *>,
+                             int> = 0>
+  auto get_max_nd_range_size() const {
+    if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+      return _max_nd_range_size;
+    else
+      return _max_nd_range_size_i;
+  }
+  template <typename NDRangeSizeTy = size_t *,
+            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                 std::is_same_v<NDRangeSizeTy, int *>,
+                             int> = 0>
+  auto get_max_nd_range_size() {
+    if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+      return _max_nd_range_size;
+    else
+      return _max_nd_range_size_i;
+  }
+  size_t get_global_mem_size() const { return _global_mem_size; }
+  size_t get_local_mem_size() const { return _local_mem_size; }
+  /// Returns the maximum clock rate of device's global memory in kHz. If
+  /// compiler does not support this API then returns default value 3200000 kHz.
+  unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
+  /// Returns the maximum bus width between device and memory in bits. If
+  /// compiler does not support this API then returns default value 64 bits.
+  unsigned int get_memory_bus_width() const { return _memory_bus_width; }
+  uint32_t get_device_id() const { return _device_id; }
+  std::array<unsigned char, 16> get_uuid() const { return _uuid; }
+  /// Returns global memory cache size in bytes.
+  unsigned int get_global_mem_cache_size() const {
+    return _global_mem_cache_size;
+  }
+
+  // set interface
+  void set_name(const char* name) {
+    size_t length = strlen(name);
+    if (length < 256) {
+      std::memcpy(_name, name, length + 1);
+    } else {
+      std::memcpy(_name, name, 255);
+      _name[255] = '\0';
+    }
+  }
+  void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) {
+    for (int i = 0; i < 3; ++i)
+      _max_work_item_sizes_i[i] = max_work_item_sizes[i];
+  }
+  [[deprecated]] void
+  set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) {
+    for (int i = 0; i < 3; ++i) {
+      _max_work_item_sizes_i[i] = max_work_item_sizes[i];
+    }
+  }
+  void set_host_unified_memory(bool host_unified_memory) {
+    _host_unified_memory = host_unified_memory;
+  }
+  void set_major_version(int major) { _major = major; }
+  void set_minor_version(int minor) { _minor = minor; }
+  void set_integrated(int integrated) { _integrated = integrated; }
+  void set_max_clock_frequency(int frequency) { _frequency = frequency; }
+  void set_max_compute_units(int max_compute_units) {
+    _max_compute_units = max_compute_units;
+  }
+  void set_global_mem_size(size_t global_mem_size) {
+    _global_mem_size = global_mem_size;
+  }
+  void set_local_mem_size(size_t local_mem_size) {
+    _local_mem_size = local_mem_size;
+  }
+  void set_max_work_group_size(int max_work_group_size) {
+    _max_work_group_size = max_work_group_size;
+  }
+  void set_max_sub_group_size(int max_sub_group_size) {
+    _max_sub_group_size = max_sub_group_size;
+  }
+  void
+  set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) {
+    _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
+  }
+  void set_max_nd_range_size(int max_nd_range_size[]) {
+    for (int i = 0; i < 3; i++) {
+      _max_nd_range_size[i] = max_nd_range_size[i];
+      _max_nd_range_size_i[i] = max_nd_range_size[i];
+    }
+  }
+  void set_memory_clock_rate(unsigned int memory_clock_rate) {
+    _memory_clock_rate = memory_clock_rate;
+  }
+  void set_memory_bus_width(unsigned int memory_bus_width) {
+    _memory_bus_width = memory_bus_width;
+  }
+  void
+  set_max_register_size_per_work_group(int max_register_size_per_work_group) {
+    _max_register_size_per_work_group = max_register_size_per_work_group;
+  }
+  void set_device_id(uint32_t device_id) {
+    _device_id = device_id;
+  }
+  void set_uuid(std::array<unsigned char, 16> uuid) {
+    _uuid = std::move(uuid);
+  }
+  void set_global_mem_cache_size(unsigned int global_mem_cache_size) {
+    _global_mem_cache_size = global_mem_cache_size;
+  }
+
+private:
+  char _name[256];
+  int _max_work_item_sizes_i[3];
+  bool _host_unified_memory = false;
+  int _major;
+  int _minor;
+  int _integrated = 0;
+  int _frequency;
+  // Set estimated value 3200000 kHz as default value.
+  unsigned int _memory_clock_rate = 3200000;
+  // Set estimated value 64 bits as default value.
+  unsigned int _memory_bus_width = 64;
+  unsigned int _global_mem_cache_size;
+  int _max_compute_units;
+  int _max_work_group_size;
+  int _max_sub_group_size;
+  int _max_work_items_per_compute_unit;
+  int _max_register_size_per_work_group;
+  size_t _global_mem_size;
+  size_t _local_mem_size;
+  size_t _max_nd_range_size[3];
+  int _max_nd_range_size_i[3];
+  uint32_t _device_id;
+  std::array<unsigned char, 16> _uuid;
+};
+
+static int get_major_version(const sycl::device &dev) {
+  int major, minor;
+  detail::get_version(dev, major, minor);
+  return major;
+}
+
+static int get_minor_version(const sycl::device &dev) {
+  int major, minor;
+  detail::get_version(dev, major, minor);
+  return minor;
+}
+
+static void get_device_info(device_info &out, const sycl::device &dev) {
+  device_info prop;
+  prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
+
+  int major, minor;
+  detail::get_version(dev, major, minor);
+  prop.set_major_version(major);
+  prop.set_minor_version(minor);
+
+  prop.set_max_work_item_sizes(
+#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
+      // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
+      // is an enum class element
+      dev.get_info<sycl::info::device::max_work_item_sizes>());
+#else
+      // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
+      // an int
+      dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
+#endif
+  prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));
+
+  prop.set_max_clock_frequency(
+      dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);
+
+  prop.set_max_compute_units(
+      dev.get_info<sycl::info::device::max_compute_units>());
+  prop.set_max_work_group_size(
+      dev.get_info<sycl::info::device::max_work_group_size>());
+  prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
+  prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
+
+#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
+  if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) {
+    unsigned int tmp =
+        dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
+    if (tmp != 0)
+      prop.set_memory_clock_rate(1000 * tmp);
+  }
+  if (dev.has(sycl::aspect::ext_intel_memory_bus_width)) {
+    prop.set_memory_bus_width(
+        dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
+  }
+  if (dev.has(sycl::aspect::ext_intel_device_id)) {
+    prop.set_device_id(
+        dev.get_info<sycl::ext::intel::info::device::device_id>());
+  }
+  if (dev.has(sycl::aspect::ext_intel_device_info_uuid)) {
+    prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
+  }
+#elif defined(_MSC_VER) && !defined(__clang__)
+#pragma message("get_device_info: querying memory_clock_rate and \
+memory_bus_width are not supported by the compiler used. \
+Use 3200000 kHz as memory_clock_rate default value. \
+Use 64 bits as memory_bus_width default value.")
+#else
+#warning "get_device_info: querying memory_clock_rate and \
+memory_bus_width are not supported by the compiler used. \
+Use 3200000 kHz as memory_clock_rate default value. \
+Use 64 bits as memory_bus_width default value."
+#endif
+
+  size_t max_sub_group_size = 1;
+  std::vector<size_t> sub_group_sizes =
+      dev.get_info<sycl::info::device::sub_group_sizes>();
+
+  for (const auto &sub_group_size : sub_group_sizes) {
+    if (max_sub_group_size < sub_group_size)
+      max_sub_group_size = sub_group_size;
+  }
+
+  prop.set_max_sub_group_size(max_sub_group_size);
+
+  prop.set_max_work_items_per_compute_unit(
+      dev.get_info<sycl::info::device::max_work_group_size>());
+  int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+  prop.set_max_nd_range_size(max_nd_range_size);
+
+  // Estimates max register size per work group, feel free to update the value
+  // according to device properties.
+  prop.set_max_register_size_per_work_group(65536);
+
+  prop.set_global_mem_cache_size(
+      dev.get_info<sycl::info::device::global_mem_cache_size>());
+  out = prop;
+}
+
+/// dpct device extension
+class device_ext : public sycl::device {
+  typedef std::mutex mutex_type;
+
+public:
+  device_ext() : sycl::device(), _ctx(*this) {}
+  ~device_ext() {
+    std::lock_guard<mutex_type> lock(m_mutex);
+    clear_queues();
+  }
+  device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this) {
+    std::lock_guard<mutex_type> lock(m_mutex);
+    init_queues();
+  }
+
+  int is_native_atomic_supported() { return 0; }
+  int get_major_version() const {
+    return dpct::get_major_version(*this);
+  }
+
+  int get_minor_version() const {
+    return dpct::get_minor_version(*this);
+  }
+
+  int get_max_compute_units() const {
+    return get_device_info().get_max_compute_units();
+  }
+
+  /// Return the maximum clock frequency of this device in KHz.
+  int get_max_clock_frequency() const {
+    return get_device_info().get_max_clock_frequency();
+  }
+
+  int get_integrated() const { return get_device_info().get_integrated(); }
+
+  int get_max_sub_group_size() const {
+    return get_device_info().get_max_sub_group_size();
+  }
+
+  int get_max_register_size_per_work_group() const {
+    return get_device_info().get_max_register_size_per_work_group();
+  }
+
+  int get_max_work_group_size() const {
+    return get_device_info().get_max_work_group_size();
+  }
+
+  int get_mem_base_addr_align() const {
+    return get_info<sycl::info::device::mem_base_addr_align>();
+  }
+
+  size_t get_global_mem_size() const {
+    return get_device_info().get_global_mem_size();
+  }
+
+  /// Get the number of bytes of free and total memory on the SYCL device.
+  /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
+  /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
+  void get_memory_info(size_t &free_memory, size_t &total_memory) {
+#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
+    if (!has(sycl::aspect::ext_intel_free_memory)) {
+      std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
+      free_memory = 0;
+    } else {
+      free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
+    }
+#else
+    std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
+    free_memory = 0;
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma message("Querying the number of bytes of free memory is not supported")
+#else
+#warning "Querying the number of bytes of free memory is not supported"
+#endif
+#endif
+    total_memory = get_device_info().get_global_mem_size();
+  }
+
+  void get_device_info(device_info &out) const {
+    dpct::get_device_info(out, *this);
+  }
+
+  device_info get_device_info() const {
+    device_info prop;
+    dpct::get_device_info(prop, *this);
+    return prop;
+  }
+
+  void reset() {
+    std::lock_guard<mutex_type> lock(m_mutex);
+    clear_queues();
+    init_queues();
+  }
+
+  sycl::queue &in_order_queue() { return *_q_in_order; }
+
+  sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
+
+  sycl::queue &default_queue() {
+#ifdef DPCT_USM_LEVEL_NONE
+    return out_of_order_queue();
+#else
+    return in_order_queue();
+#endif // DPCT_USM_LEVEL_NONE
+  }
+
+  void queues_wait_and_throw() {
+    std::unique_lock<mutex_type> lock(m_mutex);
+    std::vector<std::shared_ptr<sycl::queue>> current_queues(
+        _queues);
+    lock.unlock();
+    for (const auto &q : current_queues) {
+      q->wait_and_throw();
+    }
+    // Guard the destruct of current_queues to make sure the ref count is safe.
+    lock.lock();
+  }
+
+  sycl::queue *create_queue(bool enable_exception_handler = false) {
+#ifdef DPCT_USM_LEVEL_NONE
+    return create_out_of_order_queue(enable_exception_handler);
+#else
+    return create_in_order_queue(enable_exception_handler);
+#endif // DPCT_USM_LEVEL_NONE
+  }
+
+  sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
+    std::lock_guard<mutex_type> lock(m_mutex);
+    return create_queue_impl(enable_exception_handler,
+                             sycl::property::queue::in_order());
+  }
+
+  sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) {
+    std::lock_guard<mutex_type> lock(m_mutex);
+    return create_queue_impl(enable_exception_handler);
+  }
+
+  void destroy_queue(sycl::queue *&queue) {
+    std::lock_guard<mutex_type> lock(m_mutex);
+    _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
+                                  [=](const std::shared_ptr<sycl::queue> &q) -> bool {
+                                    return q.get() == queue;
+                                  }),
+                   _queues.end());
+    queue = nullptr;
+  }
+  void set_saved_queue(sycl::queue* q) {
+    std::lock_guard<mutex_type> lock(m_mutex);
+    _saved_queue = q;
+  }
+  sycl::queue *get_saved_queue() const {
+    std::lock_guard<mutex_type> lock(m_mutex);
+    return _saved_queue;
+  }
+  sycl::context get_context() const { return _ctx; }
+
+private:
+  void clear_queues() {
+    _queues.clear();
+    _q_in_order = _q_out_of_order = _saved_queue = nullptr;
+  }
+
+  void init_queues() {
+    _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
+    _q_out_of_order = create_queue_impl(true);
+    _saved_queue = &default_queue();
+  }
+
+  /// Caller should acquire resource \p m_mutex before calling this function.
+  template <class... Properties>
+  sycl::queue *create_queue_impl(bool enable_exception_handler,
+                                 Properties... properties) {
+    sycl::async_handler eh = {};
+    if (enable_exception_handler) {
+      eh = exception_handler;
+    }
+    _queues.push_back(std::make_shared<sycl::queue>(
+        _ctx, *this, eh,
+        sycl::property_list(
+#ifdef DPCT_PROFILING_ENABLED
+            sycl::property::queue::enable_profiling(),
+#endif
+            properties...)));
+
+    return _queues.back().get();
+  }
+
+  void get_version(int &major, int &minor) const {
+    detail::get_version(*this, major, minor);
+  }
+  sycl::queue *_q_in_order, *_q_out_of_order;
+  sycl::queue *_saved_queue;
+  sycl::context _ctx;
+  std::vector<std::shared_ptr<sycl::queue>> _queues;
+  mutable mutex_type m_mutex;
+};
+
+static inline unsigned int get_tid() {
+#if defined(__linux__)
+  return syscall(SYS_gettid);
+#elif defined(_WIN64)
+  return GetCurrentThreadId();
+#else
+#error "Only support Windows and Linux."
+#endif
+}
+
+/// device manager
+class dev_mgr {
+public:
+  device_ext &current_device() {
+    unsigned int dev_id=current_device_id();
+    check_id(dev_id);
+    return *_devs[dev_id];
+  }
+  device_ext &cpu_device() const {
+    std::lock_guard<std::recursive_mutex> lock(m_mutex);
+    if (_cpu_device == -1) {
+      throw std::runtime_error("no valid cpu device");
+    } else {
+      return *_devs[_cpu_device];
+    }
+  }
+  device_ext &get_device(unsigned int id) const {
+    std::lock_guard<std::recursive_mutex> lock(m_mutex);
+    check_id(id);
+    return *_devs[id];
+  }
+  unsigned int current_device_id() const {
+   std::lock_guard<std::recursive_mutex> lock(m_mutex);
+   auto it=_thread2dev_map.find(get_tid());
+   if(it != _thread2dev_map.end())
+      return it->second;
+    return DEFAULT_DEVICE_ID;
+  }
+
+/// Select device with a device ID.
+/// \param [in] id The id of the device which can
+/// be obtained through get_device_id(const sycl::device).
+  void select_device(unsigned int id) {
+    std::lock_guard<std::recursive_mutex> lock(m_mutex);
+    check_id(id);
+    _thread2dev_map[get_tid()]=id;
+  }
+  unsigned int device_count() { return _devs.size(); }
+
+  unsigned int get_device_id(const sycl::device &dev) {
+    unsigned int id = 0;
+    for(auto dev_item : _devs) {
+      if (*dev_item == dev) {
+        break;
+      }
+      id++;
+    }
+    return id;
+  }
+
+  template <class DeviceSelector>
+  std::enable_if_t<
+      std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
+  select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
+    sycl::device selected_device = sycl::device(selector);
+    unsigned int selected_device_id = get_device_id(selected_device);
+    select_device(selected_device_id);
+  }
+
+  /// Returns the instance of device manager singleton.
+  static dev_mgr &instance() {
+    static dev_mgr d_m;
+    return d_m;
+  }
+  dev_mgr(const dev_mgr &) = delete;
+  dev_mgr &operator=(const dev_mgr &) = delete;
+  dev_mgr(dev_mgr &&) = delete;
+  dev_mgr &operator=(dev_mgr &&) = delete;
+
+private:
+  mutable std::recursive_mutex m_mutex;
+  dev_mgr() {
+    sycl::device default_device =
+        sycl::device(sycl::default_selector_v);
+    _devs.push_back(std::make_shared<device_ext>(default_device));
+
+    std::vector<sycl::device> sycl_all_devs =
+        sycl::device::get_devices(sycl::info::device_type::all);
+    // Collect other devices except for the default device.
+    if (default_device.is_cpu())
+      _cpu_device = 0;
+    for (auto &dev : sycl_all_devs) {
+      if (dev == default_device) {
+        continue;
+      }
+      _devs.push_back(std::make_shared<device_ext>(dev));
+      if (_cpu_device == -1 && dev.is_cpu()) {
+        _cpu_device = _devs.size() - 1;
+      }
+    }
+  }
+  void check_id(unsigned int id) const {
+    if (id >= _devs.size()) {
+      throw std::runtime_error("invalid device id");
+    }
+  }
+  std::vector<std::shared_ptr<device_ext>> _devs;
+  /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
+  /// thread id in _thread2dev_map, which means default device should be used
+  /// for the current thread.
+  const unsigned int DEFAULT_DEVICE_ID = 0;
+  /// thread-id to device-id map.
+  std::map<unsigned int, unsigned int> _thread2dev_map;
+  int _cpu_device = -1;
+};
+
+/// Util function to get the default queue of current selected device depends on
+/// the USM config. Return the default out-of-ordered queue when USM-none is
+/// enabled, otherwise return the default in-ordered queue.
+static inline sycl::queue &get_default_queue() {
+  return dev_mgr::instance().current_device().default_queue();
+}
+
+/// Util function to get the default in-ordered queue of current device in
+/// dpct device manager.
+static inline sycl::queue &get_in_order_queue() {
+  return dev_mgr::instance().current_device().in_order_queue();
+}
+
+/// Util function to get the default out-of-ordered queue of current device in
+/// dpct device manager.
+static inline sycl::queue &get_out_of_order_queue() {
+  return dev_mgr::instance().current_device().out_of_order_queue();
+}
+
+/// Util function to get the id of current device in
+/// dpct device manager.
+static inline unsigned int get_current_device_id() {
+  return dev_mgr::instance().current_device_id();
+}
+
+/// Util function to get the current device.
+static inline device_ext &get_current_device() {
+  return dev_mgr::instance().current_device();
+}
+
+/// Util function to get a device by id.
+static inline device_ext &get_device(unsigned int id) {
+  return dev_mgr::instance().get_device(id);
+}
+
+/// Util function to get the context of the default queue of current
+/// device in dpct device manager.
+static inline sycl::context get_default_context() {
+  return dpct::get_current_device().get_context();
+}
+
+/// Util function to get a CPU device.
+static inline device_ext &cpu_device() {
+  return dev_mgr::instance().cpu_device();
+}
+
+static inline unsigned int select_device(unsigned int id) {
+  dev_mgr::instance().select_device(id);
+  return id;
+}
+
+template <class DeviceSelector>
+static inline std::enable_if_t<
+    std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
+select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
+  dev_mgr::instance().select_device(selector);
+}
+
+static inline unsigned int get_device_id(const sycl::device &dev){
+  return dev_mgr::instance().get_device_id(dev);
+}
+
+/// Util function to check whether a device supports some kinds of sycl::aspect.
+inline void
+has_capability_or_fail(const sycl::device &dev,
+                       const std::initializer_list<sycl::aspect> &props) {
+  for (const auto &it : props) {
+    if (dev.has(it))
+      continue;
+    switch (it) {
+    case sycl::aspect::fp64:
+      throw std::runtime_error("'double' is not supported in '" +
+                               dev.get_info<sycl::info::device::name>() +
+                               "' device");
+      break;
+    case sycl::aspect::fp16:
+      throw std::runtime_error("'half' is not supported in '" +
+                               dev.get_info<sycl::info::device::name>() +
+                               "' device");
+      break;
+    default:
+#define __SYCL_ASPECT(ASPECT, ID)                                              \
+  case sycl::aspect::ASPECT:                                                   \
+    return #ASPECT;
+#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
+#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
+      auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string {
+        switch (AspectNum) {
+#include <sycl/info/aspects.def>
+#include <sycl/info/aspects_deprecated.def>
+        default:
+          return "unknown aspect";
+        }
+      };
+#undef __SYCL_ASPECT_DEPRECATED_ALIAS
+#undef __SYCL_ASPECT_DEPRECATED
+#undef __SYCL_ASPECT
+      throw std::runtime_error(
+          "'" + getAspectNameStr(it) + "' is not supported in '" +
+          dev.get_info<sycl::info::device::name>() + "' device");
+    }
+    break;
+  }
+}
+} // namespace dpct
+
+#endif // __DPCT_DEVICE_HPP__
diff --git a/dpct/dnnl_utils.hpp b/dpct/dnnl_utils.hpp
new file mode 100644
index 0000000000000..caf5a768b77e2
--- /dev/null
+++ b/dpct/dnnl_utils.hpp
@@ -0,0 +1,4921 @@
+//==---- dnnl_utils.hpp ---------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_DNNL_UTILS_HPP__
+#define __DPCT_DNNL_UTILS_HPP__
+
+#include <oneapi/dpl/algorithm>
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/numeric>
+#include <oneapi/mkl.hpp>
+#include <oneapi/mkl/rng/device.hpp>
+#include <sycl/sycl.hpp>
+#include <oneapi/dnnl/dnnl.hpp>
+#include <oneapi/dnnl/dnnl_sycl.hpp>
+#include <unordered_map>
+#include <algorithm>
+#include <list>
+
+#include "memory.hpp"
+#include "device.hpp"
+#include "lib_common_utils.hpp"
+
+namespace dpct {
+namespace dnnl {
+/// Get concatenated library version as an integer.
+static inline size_t get_version() {
+  const ::dnnl::version_t *ver = ::dnnl::version();
+  return ver->major * 1000 + ver->minor * 100 + ver->patch;
+}
+class engine_ext;
+typedef oneapi::mkl::rng::philox4x32x10 rng_engine_t;
+/// An enum class representing memory layout. Used by
+/// memory_desc_ext to create a memory with pre-defined layout.
+enum class memory_format_tag { nchw, nhwc, nchw_blocked };
+
+/// An enum class representing RNN data memory layout. Used by
+/// memory_desc_ext to create a memory with pre-defined layout.
+enum class rnn_memory_format_tag { tnc, ntc };
+
+/// A class holding the description of an N-dimensions memory.
+class memory_desc_ext {
+  ::dnnl::memory::desc _desc;
+public:
+  /// Convert dpct::library_data_t to dnnl::memory::data_type.
+  static ::dnnl::memory::data_type to_dnnl_data_type(dpct::library_data_t dt);
+  /// Convert dnnl::memory::data_type to dpct::library_data_t.
+  static dpct::library_data_t
+  to_dpct_library_data_t(::dnnl::memory::data_type dt, unsigned block_size);
+  /// Convert dpct::dnnl::memory_format_tag to dnnl::memory::format_tag.
+  static ::dnnl::memory::format_tag to_dnnl_format_tag(dpct::library_data_t dt,
+                                                       memory_format_tag tag);
+  memory_desc_ext() = default;
+  memory_desc_ext(::dnnl::memory::desc &desc) : _desc(desc) {}
+  memory_desc_ext(::dnnl::memory::desc &&desc) : _desc(std::move(desc)) {}
+  /// Setting a 4D memory with given parameters.
+  /// \param [in] tag Format tag.
+  /// \param [in] dt Data type.
+  /// \param [in] n Number of images.
+  /// \param [in] c Number of channels.
+  /// \param [in] h Height of images.
+  /// \param [in] w Width of images.
+  void set(memory_format_tag tag, dpct::library_data_t dt, int n, int c, int h,
+           int w);
+  /// Setting a 3D RNN data memory with given parameters.
+  /// \param [in] tag RNN data format tag.
+  /// \param [in] dt Data type.
+  /// \param [in] t Number of sequence length.
+  /// \param [in] n Number of batch.
+  /// \param [in] c Height of input channel.
+  void set(rnn_memory_format_tag tag, dpct::library_data_t dt, int t, int n, int c);
+  /// Setting a 4D memory with given parameters.
+  /// \param [in] dt Data type.
+  /// \param [in] n Number of images.
+  /// \param [in] c Number of channels.
+  /// \param [in] h Height of images.
+  /// \param [in] w Width of images.
+  /// \param [in] n_stride Stride between two continuous images.
+  /// \param [in] c_stride Stride between two continuous channels.
+  /// \param [in] h_stride Stride between two continuous rows.
+  /// \param [in] w_stride Stride between two continuous columns.
+  void set(dpct::library_data_t dt, int n, int c, int h, int w, int n_stride,
+           int c_stride, int h_stride, int w_stride);
+  /// Setting a ND memory with given parameters.
+  /// \param [in] dt Data type.
+  /// \param [in] ndims Dimension of the memory.
+  /// \param [in] dims Array of dimension ndims that contain the size of each
+  /// memory dimension. \param [in] strides Array of dimension ndims that
+  /// contain the stride of each memory dimension.
+  void set(dpct::library_data_t dt, int ndims, const int dims[],
+           const int strides[]);
+  /// Setting a ND memory with given parameters.
+  /// \param [in] tag Format tag.
+  /// \param [in] dt Data type.
+  /// \param [in] ndims Dimension of the memory.
+  /// \param [in] dims Array of dimension ndims that contain the size of each
+  /// memory dimension.
+  void set(memory_format_tag tag, dpct::library_data_t dt, int ndims,
+           const int dims[]);
+  /// Getting a ::dnnl::memory::desc from a memory_desc_ext.
+  /// \returns The ::dnnl::memory::desc.
+  const ::dnnl::memory::desc &get_desc() const { return _desc; }
+  /// Setting holding desc with given dnnl memory descriptor.
+  void set_desc(::dnnl::memory::desc desc) { _desc = desc; }
+  /// Getting a size of a memory_desc_ext in bytes.
+  /// \returns The size.
+  size_t get_size() const { return _desc.get_size(); }
+  /// Getting parameters from a 4D memory.
+  /// \param [out] dt Data type.
+  /// \param [out] n Number of images.
+  /// \param [out] c Number of channels.
+  /// \param [out] h Height of images.
+  /// \param [out] w Width of images.
+  /// \param [out] n_stride Stride between two continuous images.
+  /// \param [out] c_stride Stride between two continuous channels.
+  /// \param [out] h_stride Stride between two continuous rows.
+  /// \param [out] w_stride Stride between two continuous columns.
+  void get(dpct::library_data_t *dt, int *n, int *c, int *h, int *w,
+           int *n_stride, int *c_stride, int *h_stride, int *w_stride) const;
+  /// Getting parameters from a 4D memory.
+  /// \param [out] dt Data type.
+  /// \param [out] tag Format tag.
+  /// \param [out] n Number of images.
+  /// \param [out] c Number of channels.
+  /// \param [out] h Height of images.
+  /// \param [out] w Width of images.
+  void get(dpct::library_data_t *dt, memory_format_tag *tag, int *n, int *c,
+           int *h, int *w) const;
+  /// Getting parameters from a 3D RNN data memory.
+  /// \param [out] dt Data type.
+  /// \param [out] tag RNN data format tag.
+  /// \param [out] t Number of sequence length.
+  /// \param [out] n Number of batch.
+  /// \param [out] c Height of input channel.
+  void get(dpct::library_data_t *dt, rnn_memory_format_tag *tag, int *t, int *n,
+           int *c) const;
+  /// Getting parameters from a ND memory.
+  /// \param [in] requested_ndims Requested number of dimensions to get from a
+  /// given memory descriptor.
+  /// \param [out] dt Data type.
+  /// \param [out] ndims Dimension of the memory.
+  /// \param [out] dims Array of dimension requested_ndims that contain the 
+  /// size of each memory dimension.
+  /// \param [out] strides Array of dimension requested_ndims that contain the
+  /// stride of each memory dimension.
+  void get(int requested_ndims, dpct::library_data_t *dt, int *ndims,
+           int dims[], int strides[]) const;
+  /// Getting parameters from a ND memory.
+  /// \param [in] requested_ndims Requested number of dimensions to get from a
+  /// given memory descriptor.
+  /// \param [out] dt Data type.
+  /// \param [out] tag Format tag.
+  /// \param [out] ndims Dimension of the memory.
+  /// \param [out] dims Array of dimension requested_ndims that contain the 
+  /// size of each memory dimension.
+  void get(int requested_ndims, dpct::library_data_t *dt,
+           memory_format_tag *tag, int *ndims, int dims[]) const;
+  /// Getting dims from a ND memory.
+  /// \return The dims.
+  std::vector<int64_t> get_dims() const { return _desc.get_dims(); }
+  /// Getting strides from a ND memory.
+  /// \return The strides.
+  std::vector<int64_t> get_strides() const {
+    return _desc.get_strides();
+  }
+  /// Getting element num from a ND memory.
+  /// \return The element number.
+  size_t get_element_num() const {
+    auto dims = _desc.get_dims();
+    if (dims.empty()) {
+      return 0;
+    }
+    size_t result = 1;
+    for (auto &dim : dims) {
+      result *= dim;
+    }
+    return result;
+  }
+
+  operator bool() const {
+    return bool(_desc);
+  }
+
+  memory_desc_ext &operator=(std::nullptr_t) {
+    _desc.reset(nullptr);
+    return *this;
+  }
+};
+
+/// A class holding description for an activation operation.
+class activation_desc {
+  ::dnnl::algorithm _alg;
+  float _alpha;
+  float _beta;
+
+public:
+  /// Setting an activation descriptor with given parameters.
+  /// \param [in] alg Activation algorithm.
+  /// \param [in] alpha Value of alpha parameter.
+  void set(::dnnl::algorithm alg, float alpha) {
+    _alg = alg;
+    if(alg == ::dnnl::algorithm::eltwise_clip) {
+      _alpha = 0;
+      _beta = alpha;
+    } else {
+      _alpha = alpha;
+    }
+  }
+  /// Getting parameters form an activation descriptor.
+  /// \param [out] alg Activation algorithm.
+  /// \param [out] alpha Value of alpha parameter.
+  void get(::dnnl::algorithm *alg, float *alpha) const {
+    *alg = _alg;
+    if(_alg == ::dnnl::algorithm::eltwise_clip) {
+      *alpha = _beta;
+    } else {
+      *alpha = _alpha;
+    }
+  }
+  /// Setting the alpha parameter of an activation descriptor.
+  /// \param [in] alpha Value of alpha parameter.
+  void set_alpha(float alpha) { _alpha = alpha; }
+  /// Setting the beta parameter of an activation descriptor.
+  /// \param [in] beta Value of beta parameter.
+  void set_beta(float beta) { _beta = beta; }
+  /// Setting the algorithm parameter of an activation descriptor.
+  /// \param [in] alg Activation algorithm.
+  void set_algorithm(::dnnl::algorithm alg) { _alg = alg; }
+  /// Getting the alpha parameter from an activation descriptor.
+  /// \param [out] alpha Value of alpha parameter.
+  float get_alpha() const { return _alpha; }
+  /// Getting the beta parameter from an activation descriptor.
+  /// \param [out] beta Value of beta parameter.
+  float get_beta() const { return _beta; }
+  /// Getting the algorithm parameter from an activation descriptor.
+  /// \param [out] alg Activation algorithm.
+  ::dnnl::algorithm get_algorithm() const { return _alg; }
+};
+
+/// A class holding description for a local response normalization operation.
+class lrn_desc {
+  unsigned int _local_size;
+  float _alpha;
+  float _beta;
+  float _k;
+
+public:
+  /// Setting a local response normalization descriptor with given parameters.
+  /// \param [in] local_size Value of local_size parameter.
+  /// \param [in] alpha Value of alpha parameter.
+  /// \param [in] beta Value of beta parameter.
+  /// \param [in] k Value of k parameter.
+  void set(unsigned int local_size, float alpha, float beta, float k) {
+    _local_size = local_size;
+    _alpha = alpha;
+    _beta = beta;
+    _k = k;
+  }
+  /// Getting parameters form a local response normalization descriptor.
+  /// \param [out] local_size Value of local_size parameter.
+  /// \param [out] alpha Value of alpha parameter.
+  /// \param [out] beta Value of beta parameter.
+  /// \param [out] k Value of k parameter.
+  void get(unsigned int *local_size, float *alpha, float *beta,
+           float *k) const {
+    *local_size = _local_size;
+    *alpha = _alpha;
+    *beta = _beta;
+    *k = _k;
+  }
+  /// Setting the local size parameter of a local response normalization
+  /// descriptor.
+  /// \param [in] local_size Value of local_size parameter.
+  void set_local_size(unsigned int local_size) { _local_size = local_size; }
+  /// Setting the alpha parameter of a local response normalization descriptor.
+  /// \param [in] alpha Value of alpha parameter.
+  void set_alpha(float alpha) { _alpha = alpha; }
+  /// Setting the beta parameter of a local response normalization descriptor.
+  /// \param [in] beta Value of beta parameter.
+  void set_beta(float beta) { _beta = beta; }
+  /// Setting the k parameter of a local response normalization descriptor.
+  /// \param [in] k Value of k parameter.
+  void set_k(float k) { _k = k; }
+  /// Getting the local size parameter from a local response normalization
+  /// descriptor.
+  /// \param [out] local_size Value of local_size parameter.
+  unsigned int get_local_size() const { return _local_size; }
+  /// Getting the alpha parameter from a local response normalization
+  /// descriptor.
+  /// \param [out] alpha Value of alpha parameter.
+  float get_alpha() const { return _alpha; }
+  /// Getting the beta parameter from a local response normalization descriptor.
+  /// \param [out] beta Value of beta parameter.
+  float get_beta() const { return _beta; }
+  /// Getting the k parameter from a local response normalization descriptor.
+  /// \param [out] k Value of k parameter.
+  float get_k() const { return _k; }
+};
+
+/// An enum class representing softmax algorithm.
+enum class softmax_algorithm { normal, log };
+/// An enum class representing softmax mode.
+enum class softmax_mode { instance, channel };
+
+/// A class holding description for a pooling operation.
+class pooling_desc {
+  ::dnnl::algorithm _alg;
+  std::vector<int64_t> _stride;
+  std::vector<int64_t> _kernel;
+  std::vector<int64_t> _padding;
+
+public:
+  /// Setting a 2D pooling descriptor with given parameters.
+  /// \param [in] alg Pooling algorithm.
+  /// \param [in] kernel_h Value of height of kernel.
+  /// \param [in] kernel_w Value of width of kernel.
+  /// \param [in] padding_h Value of height of padding.
+  /// \param [in] padding_w Value of width of padding.
+  /// \param [in] stride_h Value of height of stride.
+  /// \param [in] stride_w Value of width of stride.
+  void set(::dnnl::algorithm alg, int kernel_h, int kernel_w, int padding_h,
+           int padding_w, int stride_h, int stride_w) {
+    _alg = alg;
+    _stride = {stride_h, stride_w};
+    _kernel = {kernel_h, kernel_w};
+    _padding = {padding_h, padding_w};
+  }
+  /// Setting a ND pooling descriptor with given parameters.
+  /// \param [in] alg Pooling algorithm.
+  /// \param [in] ndims Dimension of the pooling operation.
+  /// \param [in] kernel Array of dimension ndims containing the kernel size of
+  /// each dimension.
+  /// \param [in] padding Array of dimension ndims containing the padding size of
+  /// each dimension.
+  /// \param [in] stride Array of dimension ndims containing the stride size of
+  /// each dimension.
+  void set(::dnnl::algorithm alg, int ndims, int kernel[], int padding[],
+           int stride[]) {
+    _alg = alg;
+    _stride = std::vector<int64_t>(stride, stride + ndims);
+    _kernel = std::vector<int64_t>(kernel, kernel + ndims);
+    _padding = std::vector<int64_t>(padding, padding + ndims);
+  }
+  /// Getting parameters from a 2D pooling descriptor.
+  /// \param [out] alg Pooling algorithm.
+  /// \param [out] kernel_h Value of height of kernel.
+  /// \param [out] kernel_w Value of width of kernel.
+  /// \param [out] padding_h Value of height of padding.
+  /// \param [out] padding_w Value of width of padding.
+  /// \param [out] stride_h Value of height of stride.
+  /// \param [out] stride_w Value of width of stride.
+  void get(::dnnl::algorithm *alg, int *kernel_h, int *kernel_w, int *padding_h,
+           int *padding_w, int *stride_h, int *stride_w) const {
+    *alg = _alg;
+    *kernel_h = _kernel[0];
+    *kernel_w = _kernel[1];
+    *padding_h = _padding[0];
+    *padding_w = _padding[1];
+    *stride_h = _stride[0];
+    *stride_w = _stride[1];
+  }
+  /// Getting parameters from a ND pooling descriptor.
+  /// \param [in] requested_ndims Requested number of dimensions to get from a
+  /// given pooling descriptor.
+  /// \param [out] alg Pooling algorithm.
+  /// \param [out] ndims Dimension of the pooling operation.
+  /// \param [out] kernel Array of dimension ndims containing the kernel size of
+  /// each dimension.
+  /// \param [out] padding Array of dimension ndims containing the padding size
+  /// of each dimension.
+  /// \param [out] stride Array of dimension ndims containing the stride size of
+  /// each dimension.
+  void get(int requested_ndims, ::dnnl::algorithm *alg, int *ndims,
+           int kernel[], int padding[], int stride[]) const {
+    *alg = _alg;
+    *ndims = _stride.size();
+    for (int i = 0; i < requested_ndims; i++) {
+      kernel[i] = _kernel[i];
+      padding[i] = _padding[i];
+      stride[i] = _stride[i];
+    }
+  }
+  /// Setting the algorithm parameter of a pooling descriptor.
+  /// \param [in] alg Pooling algorithm.
+  void set_algorithm(::dnnl::algorithm alg) { _alg = alg; }
+  /// Setting the stride parameter of a pooling descriptor.
+  /// \param [in] stride Array of dimension ndims containing the stride size of
+  /// each dimension.
+  void set_stride(const std::vector<int64_t> &stride) { _stride = stride; }
+  /// Setting the kernel parameter of a pooling descriptor.
+  /// \param [in] kernel Array of dimension ndims containing the kernel size of
+  /// each dimension.
+  void set_kernel(const std::vector<int64_t> &kernel) { _kernel = kernel; }
+  /// Setting the padding parameter of a pooling descriptor.
+  /// \param [in] padding Array of dimension ndims containing the padding size
+  /// of each dimension.
+  void set_padding(const std::vector<int64_t> &padding) { _padding = padding; }
+
+  /// Getting the algorithm parameter from a pooling descriptor.
+  /// \param [out] alg Pooling algorithm.
+  ::dnnl::algorithm get_algorithm() const { return _alg; }
+  /// Getting the stride parameter from a pooling descriptor.
+  /// \returns Array of dimension ndims containing the stride size of each
+  /// dimension.
+  const std::vector<int64_t> &get_stride() const { return _stride; }
+  /// Getting the kernel parameter from a pooling descriptor.
+  /// \returns Array of dimension ndims containing the kernel size of each
+  /// dimension.
+  const std::vector<int64_t> &get_kernel() const { return _kernel; }
+  /// Getting the padding parameter from a pooling descriptor.
+  /// \returns Array of dimension ndims containing the padding size of each
+  /// dimension.
+  const std::vector<int64_t> &get_padding() const { return _padding; }
+  /// Getting the output dimensions of a memory after 2D pooling has been
+  /// applied.
+  /// \param [in] desc Input memory descriptor.
+  /// \param [out] out_n Number of images.
+  /// \param [out] out_c Number of channels.
+  /// \param [out] out_h Height of images.
+  /// \param [out] out_w Width of images.
+  void get_forward_output_dim(const memory_desc_ext &desc, int *out_n,
+                              int *out_c, int *out_h, int *out_w) const {
+    auto dims = desc.get_dims();
+    *out_n = dims[0];
+    *out_c = dims[1];
+    *out_h = 1 + (dims[2] + 2 * _padding[0] - _kernel[0]) / _stride[0];
+    *out_w = 1 + (dims[3] + 2 * _padding[1] - _kernel[1]) / _stride[1];
+  }
+  /// Getting the output dimensions of a memory after ND pooling has been
+  /// applied.
+  /// \param [in] desc Input memory descriptor.
+  /// \param [out] ndims Dimension of the memory.
+  /// \param [out] out_dims Array of dimension requested_ndims that contain
+  /// the size of each memory dimension.
+  void get_forward_output_dim(const memory_desc_ext &desc, int ndims,
+                              int out_dims[]) const {
+    assert(ndims >= 4 && "ndims is at least 4.");
+    auto dims = desc.get_dims();
+    out_dims[0] = dims[0];
+    out_dims[1] = dims[1];
+    for (int i = 2; i < ndims; i++) {
+      out_dims[i] =
+          1 + (dims[i] + 2 * _padding[i - 2] - _kernel[i - 2]) / _stride[i - 2];
+    }
+  }
+};
+
+/// An enum class representing reduction operations.
+enum class reduction_op {
+  max,
+  min,
+  sum,
+  mul,
+  mean,
+  amax,
+  mul_no_zeros,
+  norm1,
+  norm2
+};
+
+/// An enum class representing batch normalization mode.
+enum class batch_normalization_mode { per_activation, spatial };
+
+/// An enum class representing batch normalization operations.
+enum class batch_normalization_ops { none, activation, add_activation };
+
+/// An enum class representing binary operations.
+enum class binary_op { add, sub, mul, div, min, max, sqrt, neg };
+
+/// An struct representing convolution algorithm infomation.
+struct convolution_algorithm_info {
+  ::dnnl::algorithm algo = ::dnnl::algorithm::convolution_auto;
+  int status = 0;
+};
+
+/// A class holding description for a convolution operation.
+class convolution_desc {
+  std::vector<int64_t> _strides;
+  std::vector<int64_t> _dilates;
+  std::vector<int64_t> _paddings;
+  int _group_count = 1;
+  ::dnnl::fpmath_mode _math_mode = ::dnnl::fpmath_mode::strict;
+public:
+  /// Setting a group count to be used in the convolution.
+  /// \param [in] group_count Value of group count.
+  void set_group_count(int group_count) { _group_count = group_count; }
+  /// Getting a group count specified in the given convolution descriptor.
+  /// \returns Value of group count.
+  int get_group_count() { return _group_count; }
+  /// Setting floating point math mode to be used in the convolution.
+  /// \param [in] math_mode Value of math_mode.
+  void set_math_mode(::dnnl::fpmath_mode math_mode) { _math_mode = math_mode; }
+  /// Getting floating point math mode specified in the given convolution descriptor.
+  /// \returns Value of math mode.
+  ::dnnl::fpmath_mode get_math_mode() { return _math_mode; }
+  /// Setting a 2D convolution descriptor with given parameters.
+  /// \param [in] padding_h Value of height of padding.
+  /// \param [in] padding_w Value of width of padding.
+  /// \param [in] stride_h Value of height of stride.
+  /// \param [in] stride_w Value of width of stride.
+  /// \param [in] dilate_h Value of height of dilate.
+  /// \param [in] dilate_w Value of width of dilate.
+  void set(int padding_h, int padding_w, int stride_h, int stride_w,
+           int dilate_h, int dilate_w) {
+    _strides = {stride_h, stride_w};
+    _dilates = {dilate_h - 1, dilate_w - 1};
+    _paddings = {padding_h, padding_w};
+  }
+  /// Setting a ND convolution descriptor with given parameters.
+  /// \param [in] ndims Dimension of the convolution operation.
+  /// \param [in] paddings Array of dimension ndims containing the padding size of
+  /// each dimension.
+  /// \param [in] strides Array of dimension ndims containing the stride size of
+  /// each dimension.
+  /// \param [in] dilates Array of dimension ndims containing the kernel size of
+  /// each dimension.
+  void set(int ndims, int paddings[], int strides[], int dilates[]) {
+    _strides = std::vector<int64_t>(strides, strides + ndims);
+    _paddings = std::vector<int64_t>(paddings, paddings + ndims);
+    _dilates = std::vector<int64_t>(dilates, dilates + ndims);
+    for (auto &dilate : _dilates) {
+      dilate--;
+    }
+  }
+  /// Getting parameters from a 2D convolution descriptor.
+  /// \param [out] padding_h Value of height of padding.
+  /// \param [out] padding_w Value of width of padding.
+  /// \param [out] stride_h Value of height of stride.
+  /// \param [out] stride_w Value of width of stride.
+  /// \param [out] dilate_h Value of height of dilate.
+  /// \param [out] dilate_w Value of width of dilate.
+  void get(int *padding_h, int *padding_w, int *stride_h, int *stride_w,
+           int *dilate_h, int *dilate_w) const {
+    *dilate_h = _dilates[0];
+    *dilate_w = _dilates[1];
+    *padding_h = _paddings[0];
+    *padding_w = _paddings[1];
+    *stride_h = _strides[0];
+    *stride_w = _strides[1];
+  }
+  /// Getting parameters from a ND convolution descriptor.
+  /// \param [in] requested_ndims Requested number of dimensions to get from a
+  /// given convolution descriptor.
+  /// \param [out] ndims Dimension of the pooling operation.
+  /// \param [out] paddings Array of dimension ndims containing the padding size
+  /// of each dimension.
+  /// \param [out] strides Array of dimension ndims containing the stride size of
+  /// each dimension.
+  /// \param [out] dilates Array of dimension ndims containing the dilate size of
+  /// each dimension.
+  void get(int requested_ndims, int *ndims, int paddings[], int strides[],
+           int dilates[]) const {
+    *ndims = _strides.size();
+    for (int i = 0; i < requested_ndims; i++) {
+      dilates[i] = _dilates[i];
+      paddings[i] = _paddings[i];
+      strides[i] = _strides[i];
+    }
+  }
+  /// Getting the stride parameter from a convolution descriptor.
+  /// \returns Array of dimension ndims containing the stride size of each
+  /// dimension.
+  const std::vector<int64_t> &get_stride() const { return _strides; }
+  /// Getting the kernel parameter from a convolution descriptor.
+  /// \returns Array of dimension ndims containing the dilate size of each
+  /// dimension.
+  const std::vector<int64_t> &get_dilate() const { return _dilates; }
+  /// Getting the padding parameter from a convolution descriptor.
+  /// \returns Array of dimension ndims containing the padding size of each
+  /// dimension.
+  const std::vector<int64_t> &get_padding() const { return _paddings; }
+  /// Getting the output dimensions of a memory after 2D convolution has been
+  /// applied.
+  /// \param [in] desc Input memory descriptor.
+  /// \param [in] weight_desc Input weight memory descriptor.
+  /// \param [out] out_n Number of images.
+  /// \param [out] out_c Number of channels.
+  /// \param [out] out_h Height of images.
+  /// \param [out] out_w Width of images.
+  void get_forward_output_dim(const memory_desc_ext &desc,
+                              const memory_desc_ext &weight_desc, int *out_n,
+                              int *out_c, int *out_h, int *out_w) const {
+    auto dims = desc.get_dims();
+    auto weight_dims = weight_desc.get_dims();
+    *out_n = dims[0];
+    *out_c = weight_dims[0];
+    *out_h = 1 + (dims[2] + 2 * _paddings[0] -
+                  (1 + (_dilates[0] * (weight_dims[2] - 1)))) /
+                     _strides[0];
+    *out_w = 1 + (dims[3] + 2 * _paddings[1] -
+                  (1 + (_dilates[1] * (weight_dims[3] - 1)))) /
+                     _strides[1];
+  }
+  /// Getting the output dimensions of a memory after ND convolution has been
+  /// applied.
+  /// \param [in] desc Input memory descriptor.
+  /// \param [in] weight_desc Input weight memory descriptor.
+  /// \param [out] ndims Dimension of the memory.
+  /// \param [out] out_dims Array of dimension requested_ndims that contain
+  /// the size of each memory dimension.
+  void get_forward_output_dim(const memory_desc_ext &desc,
+                              const memory_desc_ext &weight_desc, int ndims,
+                              int out_dims[]) const {
+    assert(ndims >= 4 && "ndims is at least 4.");
+    auto dims = desc.get_dims();
+    auto weight_dims = weight_desc.get_dims();
+    out_dims[0] = dims[0];
+    out_dims[1] = weight_dims[1];
+    for (int i = 2; i < ndims; i++) {
+      out_dims[i] = 1 + (dims[i] + 2 * _paddings[i - 2] -
+                         (1 + (_dilates[i - 2] * (weight_dims[i] - 1)))) /
+                            _strides[i - 2];
+    }
+  }
+
+  convolution_desc &operator=(std::nullptr_t) {
+    return *this = convolution_desc();
+  }
+
+  operator bool() const {
+    return !(_strides.size() == 0
+             && _dilates.size() == 0
+             && _paddings.size() == 0);
+  }
+};
+
+/// An enum class representing rnn mode.
+enum class rnn_mode { vanilla_relu, vanilla_tanh, lstm, gru };
+
+/// An enum class representing rnn bias mode.
+enum class rnn_bias_mode { none, single };
+
+/// An enum class representing rnn direction.
+enum class rnn_direction {unidirectional, bidirectional};
+
+/// A class holding description for a RNN operation.
+class rnn_desc {
+  rnn_mode _mode;
+  rnn_bias_mode _bias_mode;
+  rnn_direction _direction;
+  dpct::library_data_t _dt;
+  int _input_size;
+  int _hidden_size;
+  int _projection_size;
+  int _layer_size;
+
+public:
+  void set(rnn_mode mode, rnn_bias_mode bias_mode, rnn_direction direction,
+           dpct::library_data_t dt, int input_size, int hidden_size,
+           int projection_size, int layer_size) {
+    _mode = mode;
+    _bias_mode = bias_mode;
+    _direction = direction;
+    _input_size = input_size;
+    _hidden_size = hidden_size;
+    _projection_size = projection_size;
+    _layer_size = layer_size;
+    _dt = dt;
+  }
+  void get(rnn_mode *mode, rnn_bias_mode *bias_mode, rnn_direction *direction,
+           dpct::library_data_t *dt, int *input_size, int *hidden_size,
+           int *projection_size, int *layer_size) const {
+    *mode = _mode;
+    *bias_mode = _bias_mode;
+    *direction = _direction;
+    *input_size = _input_size;
+    *hidden_size = _hidden_size;
+    *projection_size = _projection_size;
+    *layer_size = _layer_size;
+    *dt = _dt;
+  }
+};
+
+/// A class holding description for a Dropout operation.
+class dropout_desc {
+  struct dropout_desc_imp {
+    float _p = 0.5f;
+    unsigned long long _seed = 1;
+    void *_state = nullptr;
+    std::vector<std::uint8_t> _host_state;
+    rng_engine_t _rng_engine;
+    dropout_desc_imp() : _rng_engine(dpct::get_default_queue(), 1) {}
+  };
+  std::shared_ptr<dropout_desc_imp> _imp;
+
+  void generate(sycl::queue *q, std::int64_t required_state_size,
+                std::int64_t num, void *buffer) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
+                             "Interfaces Project does not support this API.");
+#else
+    sycl::event e_gen = oneapi::mkl::rng::generate(
+        oneapi::mkl::rng::bernoulli<std::int32_t>(1.f - _imp->_p),
+        _imp->_rng_engine, num, (std::int32_t *)buffer);
+    sycl::event e_save = q->submit([&](sycl::handler &cgh) {
+      cgh.depends_on(e_gen);
+      cgh.host_task([=] {
+        oneapi::mkl::rng::save_state(_imp->_rng_engine,
+                                     _imp->_host_state.data());
+      });
+    });
+    q->memcpy(_imp->_state, _imp->_host_state.data(), required_state_size,
+              e_save);
+#endif
+  }
+public:
+  operator bool() const {
+    return bool(_imp);
+  }
+  dropout_desc &operator=(std::nullptr_t) {
+    _imp.reset();
+    return *this;
+  }
+  /// Initializing a dropout descriptor.
+  void init(){
+    _imp = std::make_shared<dropout_desc_imp>();
+  }
+  /// Setting a dropout descriptor with given parameters.
+  /// \param [in] engine Engine of the dropout operation.
+  /// \param [in] p Probability of value set to zero.
+  /// \param [in] state Memory that store random generator state.
+  /// \param [in] state_size Required size to store random generator state.
+  /// \param [in] seed Seed to initialize conditions of the generator state.
+  void set(engine_ext &engine, float p, void *state, size_t state_size,
+           unsigned long long seed);
+  /// Getting parameters from a dropout descriptor.
+  /// \param [in] engine Engine of the dropout operation.
+  /// \param [in] p Probability of value set to zero.
+  /// \param [in] state Memory that store random generator state.
+  /// \param [in] seed Seed to initialize conditions of the generator state.
+  void get(float *p, void **states, unsigned long long *seed) const noexcept {
+    *seed = _imp->_seed;
+    *states = _imp->_state;
+    *p = _imp->_p;
+  }
+  /// Getting the probability of value set to zero.
+  /// \returns Probability.
+  float get_probability() const noexcept { return _imp->_p; }
+  /// Restoreing a dropout descriptor from stored state.
+  /// \param [in] engine Engine of the dropout operation.
+  /// \param [in] p Probability of value set to zero.
+  /// \param [in] state Memory that store random generator state.
+  /// \param [in] state_size Required size to store random generator state.
+  /// \param [in] seed Seed to initialize conditions of the generator state.
+  void restore(engine_ext &engine, float p, void *state, size_t state_size,
+               unsigned long long seed);
+  friend class engine_ext;
+};
+
+namespace detail {
+typedef std::string primitive_cache_key_type;
+typedef std::list<primitive_cache_key_type> usage_list_type;
+struct primitive_cache_value_type {
+  ::dnnl::primitive *_primitive;
+  std::unordered_map<int, ::dnnl::memory> *_args;
+  usage_list_type::iterator _usage_it;
+  std::function<void(::dnnl::primitive *)> _destructor;
+  sycl::event _e;
+  sycl::queue _q;
+  primitive_cache_value_type(
+      ::dnnl::primitive *primitive,
+      std::unordered_map<int, ::dnnl::memory> *args,
+      usage_list_type::iterator usage_it,
+      std::function<void(::dnnl::primitive *)> destructor, sycl::event e,
+      sycl::queue q)
+      : _primitive(primitive), _args(args), _usage_it(usage_it),
+        _destructor(destructor), _e(e), _q(q) {}
+};
+struct primitive_and_args {
+  ::dnnl::primitive *primitive;
+  std::unordered_map<int, ::dnnl::memory> *args;
+};
+typedef std::unordered_map<primitive_cache_key_type,
+                           std::shared_ptr<primitive_cache_value_type>>
+    cache_map_type;
+
+// The primitive cache uses LRU replacement policy, and the default cache
+// capacity is 1024.
+class primitive_cache {
+  int _capacity = 1024;
+  usage_list_type usage;
+  cache_map_type cache_map;
+  void touch(cache_map_type::iterator it, sycl::event e = {},
+             bool update_event = false) {
+    if (it->second->_usage_it != usage.begin()) {
+      const primitive_cache_key_type &key = it->first;
+      usage.erase(it->second->_usage_it);
+      usage.push_front(key);
+      it->second->_usage_it = usage.begin();
+    }
+    if (update_event) {
+      it->second->_e = e;
+    }
+  }
+
+public:
+  std::shared_ptr<primitive_cache_value_type>
+  get(const primitive_cache_key_type &key) {
+    auto it = cache_map.find(key);
+    if (it == cache_map.end()) {
+      return nullptr;
+    }
+    touch(it);
+    return it->second;
+  }
+  void put(const primitive_cache_key_type &key, ::dnnl::primitive *value,
+           std::unordered_map<int, ::dnnl::memory> *args,
+           std::function<void(::dnnl::primitive *)> destructor, sycl::event e,
+           sycl::queue *q) {
+    auto it = cache_map.find(key);
+    if (it != cache_map.end()) {
+      touch(it, e, true);
+    } else {
+      if (cache_map.size() == _capacity) {
+        auto v = *(cache_map.find(usage.back())->second);
+        v._q.submit([=](sycl::handler &cgh) {
+          cgh.depends_on(v._e);
+          cgh.host_task([=] {
+            delete v._args;
+            v._destructor(v._primitive);
+          });
+        });
+        cache_map.erase(usage.back());
+        usage.pop_back();
+      }
+      usage.push_front(key);
+      cache_map[key] = std::make_shared<primitive_cache_value_type>(
+          value, args, usage.begin(), destructor, e, *q);
+    }
+  }
+};
+} // namespace detail
+
+/// A class holding the oneDNN engine.
+class engine_ext {
+  struct output_argument_info {
+    float _alpha;
+    float _beta;
+    int _name;
+    memory_desc_ext _desc;
+    void *_data;
+    output_argument_info(float alpha, float beta, int name,
+                         memory_desc_ext desc, void *data)
+        : _alpha(alpha), _beta(beta), _name(name), _desc(desc), _data(data) {}
+    output_argument_info(float alpha, float beta, memory_desc_ext desc,
+                         void *data)
+        : _alpha(alpha), _beta(beta), _name(0), _desc(desc), _data(data) {}
+  };
+  struct buffer_info {
+    size_t capacity = 0;
+    uint8_t *buffer = nullptr;
+    size_t usage = 0;
+    sycl::queue q;
+    sycl::event deps;
+    size_t primitive_depth = 0;
+  };
+  struct internal_resource {
+    std::int64_t random_engine_state_size = -1;
+    buffer_info binfo;
+  };
+  std::shared_ptr<::dnnl::engine> _eng = nullptr;
+  std::shared_ptr<::dnnl::stream> _s = nullptr;
+  sycl::queue *_q = nullptr;
+  unsigned int _engine_id = 0;
+  static thread_local unsigned int _engine_count;
+  static thread_local std::map<void *, ::dnnl::memory> _workspace_map;
+  static thread_local std::map<sycl::queue *,
+                               std::shared_ptr<internal_resource>>
+      _internal_resource_cache;
+  static thread_local detail::primitive_cache _primitive_cache;
+  ::dnnl::memory &get_workspace(void *key) { return _workspace_map[key]; }
+  void insert_workspace(void *key, ::dnnl::memory workspace) {
+    _workspace_map[key] = workspace;
+  }
+  const ::dnnl::stream &get_stream() const { return *_s; }
+  const ::dnnl::engine &get_engine() const { return *_eng; }
+
+  void *allocate(const memory_desc_ext &desc, int count = 1);
+  void *allocate(size_t size);
+  std::shared_ptr<internal_resource> get_internal_resource(sycl::queue *q){
+    auto it = _internal_resource_cache.find(_q);
+    if (it == _internal_resource_cache.end()) {
+      return _internal_resource_cache[_q] = std::make_shared<internal_resource>();
+    }
+    return it->second;
+  }
+  void enter_primitive(size_t request_buffer_size = 0) {
+    auto &info = get_internal_resource(_q)->binfo;
+    if (info.primitive_depth == 0) {
+      info.usage = 0;
+      if (request_buffer_size > info.capacity) {
+        if (info.buffer && (info.capacity != 0)) {
+          auto ainfo = info;
+          ainfo.q.submit([=](sycl::handler &cgh) {
+            cgh.depends_on(ainfo.deps);
+            cgh.host_task([=] { sycl::free(ainfo.buffer, ainfo.q); });
+          });
+        }
+        size_t new_buffer_capacity =
+            std::max(request_buffer_size, info.capacity * 2);
+        info.capacity = new_buffer_capacity;
+        info.buffer = (uint8_t *)sycl::malloc_device(new_buffer_capacity, *_q);
+        info.q = *_q;
+        info.deps = sycl::event();
+      }
+    }
+    info.primitive_depth++;
+  }
+  sycl::event exit_primitive(const sycl::event &e) {
+    auto &info = get_internal_resource(_q)->binfo;
+    info.primitive_depth--;
+    if ((info.primitive_depth == 0) && info.usage) {
+      info.deps = e;
+    }
+    return e;
+  }
+  ::dnnl::memory::desc
+  compress_spatial_dimensions_to_channel(const ::dnnl::memory::desc &desc);
+  ::dnnl::memory::desc
+  get_bn_scale_bias_mean_var_desc(const ::dnnl::memory::desc &desc,
+                                  batch_normalization_mode mode);
+  sycl::event batch_normalization_backward_internal(
+      batch_normalization_mode mode, float epsilon, float alpha_data,
+      const memory_desc_ext &src_desc, void *src,
+      const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data,
+      const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param,
+      const memory_desc_ext &diff_scale_bias_desc, void *scale, void *bias,
+      float beta_param, void *diff_scale, void *diff_bias,
+      const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var);
+  sycl::event batch_normalization_forward_internal(
+      bool is_infer, batch_normalization_mode mode, float epsilon, float factor,
+      float alpha, const memory_desc_ext &src_desc, void *src, float beta,
+      const memory_desc_ext &dst_desc, void *dst,
+      const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
+      const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var,
+      void *running_mean, void *running_var);
+  ::dnnl::memory::desc
+  transfer_memory_desc_to_channel_major_format(const ::dnnl::memory::desc &desc);
+  ::dnnl::memory::desc
+  bn_reorder_memory_to_channel_major_format(
+      bool is_input, ::dnnl::memory::desc &desc, void *src, void **cache);
+  ::dnnl::memory::desc
+  transfer_memory_desc_to_format_tag_any(const ::dnnl::memory::desc &desc){
+    return ::dnnl::memory::desc(desc.get_dims(), desc.get_data_type(),
+                                ::dnnl::memory::format_tag::any);
+  }
+  void allocate_and_reorder_memory_to_optimal(::dnnl::memory::desc &from_desc,
+                                              void *&from,
+                                              ::dnnl::memory::desc &to_desc,
+                                              void *&to) {
+    if (from_desc != to_desc) {
+      to = allocate(to_desc);
+      async_reorder(1.f, from_desc, from, 0.f, to_desc, to);
+    }
+  }
+  template <typename primitive_type, typename... args_type>
+  std::pair<detail::primitive_cache_key_type, detail::primitive_and_args>
+  create_primitive_args_or_get(args_type &&...args);
+  template <typename primitive_type>
+  typename primitive_type::primitive_desc
+  get_primitive_desc(::dnnl::primitive *p);
+  template <typename primitive_type, typename... args_type>
+  typename primitive_type::primitive_desc
+  create_primitive_desc(args_type &&...args);
+  template <typename T>
+  void generate_cache_key(std::string &key_buffer, const T &arg);
+  template <typename T, typename... args_type>
+  void generate_cache_key(std::string &key_buffer, const T &first_arg,
+                          const args_type &...args);
+  void insert_arg(std::unordered_map<int, ::dnnl::memory> *args, int name,
+                  const ::dnnl::memory::desc &desc, void *data) {
+    auto it = args->find(name);
+    if (it != args->end()) {
+      it->second.set_data_handle(data);
+    } else {
+      args->insert({name, ::dnnl::memory(desc, *_eng, data)});
+    }
+  }
+  void insert_arg(std::unordered_map<int, ::dnnl::memory> *args, int name,
+                  const ::dnnl::memory &mem) {
+    (*args)[name] = mem;
+  }
+  sycl::event execute_rnn_forward_primitive(
+      rnn_mode mode, ::dnnl::prop_kind kind, ::dnnl::rnn_direction direction,
+      rnn_bias_mode bias_mode, ::dnnl::memory::data_type dt,
+      ::dnnl::memory::format_tag tag, int seq_length, int batch_size, int src_c,
+      int dst_c, int layer_size, int direction_num, int hidden_size,
+      int gate_num, int projection_size, std::vector<void *> &data,
+      std::vector<int> &offset, int iter_num, size_t *weight_size = nullptr,
+      size_t *workspace_size = nullptr, size_t *scratchpad_size = nullptr);
+
+  sycl::event rnn_forward_internal(
+      const rnn_desc &desc, ::dnnl::prop_kind kind,
+      const memory_desc_ext &src_desc, void *src,
+      const memory_desc_ext &dst_desc, void *dst,
+      const memory_desc_ext &iter_desc, void *src_iter, void *dst_iter,
+      const memory_desc_ext &iter_c_desc, void *src_iter_c, void *dst_iter_c,
+      size_t weight_size, void *weight, size_t workspace_size, void *workspace,
+      size_t scratchpad_size, void *scratchpad, bool is_get_execution_args,
+      size_t *weight_size_query, size_t *workspace_size_query,
+      size_t *scratchpad_size_query);
+
+  sycl::event execute_rnn_backward_primitive(
+      rnn_mode mode, ::dnnl::rnn_direction direction, rnn_bias_mode bias_mode,
+      ::dnnl::memory::data_type dt, ::dnnl::memory::format_tag tag,
+      int seq_length, int batch_size, int src_c, int dst_c, int layer_size,
+      int direction_num, int hidden_size, int gate_num, int projection_size,
+      std::vector<void *> &data, std::vector<int> &offset, int iter_num);
+  bool
+  scale_parameter_preprocess(const std::vector<output_argument_info> &args);
+  template <typename primitive_type>
+  sycl::event
+  execute_primitive(const std::pair<detail::primitive_cache_key_type,
+                                    detail::primitive_and_args> &primitive,
+                    const std::vector<output_argument_info> &extra_args = {});
+  template <typename T>
+  sycl::event fill_with_type(sycl::queue *q, void *src, const void *value,
+                             size_t size_with_byte) {
+    return q->fill<T>(static_cast<T *>(src), *static_cast<const T *>(value),
+                      size_with_byte / sizeof(T));
+  }
+  template <typename T> struct no_zero_op {
+    T operator()(T e) {
+      if (!e) {
+        return 1;
+      }
+      return e;
+    }
+  };
+  template <typename T>
+  void transform_no_zero_with_type(sycl::queue *q, void *src, void *dst,
+                                   size_t num) {
+    std::transform(oneapi::dpl::execution::make_device_policy(*q),
+                   static_cast<T *>(src), static_cast<T *>(src) + num,
+                   static_cast<T *>(dst), no_zero_op<T>());
+  }
+  void transform_no_zero(const memory_desc_ext &desc, void *src, void *dst);
+  ::dnnl::memory::desc get_group_weight_desc(int group_count,
+                                             const memory_desc_ext &weight_desc);
+  void get_rnn_configuration(const ::dnnl::memory::desc &desc,
+                             rnn_direction direction, rnn_mode mode,
+                             dpct::library_data_t dt, int hidden_size,
+                             ::dnnl::memory::data_type *dnnl_dt,
+                             ::dnnl::memory::format_tag *tag,
+                             int *projection_size, int *output_size,
+                             int *seq_length, int *batch_size,
+                             int *direction_num, int *gate_num);
+public:
+  engine_ext() {}
+  operator bool() const {
+    return bool(_eng) && bool(_s) && bool(_q);
+  }
+  engine_ext &operator=(std::nullptr_t) {
+    _eng = nullptr;
+    _s = nullptr;
+    _q = nullptr;
+    return *this;
+  }
+  /// Creating oneDNN engine.
+  void create_engine() {
+    _q = &dpct::get_current_device().default_queue();
+    _eng = std::make_shared<::dnnl::engine>(::dnnl::sycl_interop::make_engine(
+        dpct::get_current_device(), dpct::get_current_device().get_context()));
+    _s = std::make_shared<::dnnl::stream>(
+        ::dnnl::sycl_interop::make_stream(*_eng, *_q));
+    _engine_id = _engine_count++;
+  }
+  /// Setting the user's SYCL queue for an oneDNN engine.
+  /// \param [in] q Pointer to the SYCL queue.
+  void set_queue(sycl::queue *q) {
+    if (!q) {
+      throw std::runtime_error("set_queue: pointer must not be nullptr.");
+    }
+    if (!_eng) {
+      throw std::runtime_error("set_queue: current engine is invalid.");
+    }
+    if (q->get_context() != ::dnnl::sycl_interop::get_context(*_eng)) {
+      throw std::runtime_error(
+          "set_queue: queue is mismatch with current engine context.");
+    }
+    _q = q;
+    _s = std::make_shared<::dnnl::stream>(
+        ::dnnl::sycl_interop::make_stream(*_eng, *_q));
+  }
+  /// Retrieving the user's SYCL queue set in the oneDNN engine.
+  /// \returns Pointer to the SYCL queue.
+  sycl::queue *get_queue() const { return _q; }
+  /// Setting all elements of a memory to a given value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] valuePtr Pointer to a single value.
+  void fill(const memory_desc_ext &src_desc, void *src,
+                   const void *valuePtr);
+  /// Coping the scaled data from a memory to another memory with a different
+  /// description.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  void reorder(float alpha, const memory_desc_ext &src_desc, void *src,
+                      float beta, const memory_desc_ext &dst_desc, void *dst);
+  /// Scaling all the elements of a memory by a given factor.
+  /// \param [in] alpha Value to scaling factors.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [out] src Pointer to source data.
+  void scale(float alpha, const memory_desc_ext &src_desc, void *src);
+  /// Adding the scaled values of a memory to another memory.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  void sum(float alpha, const memory_desc_ext &src_desc, void *src,
+                  float beta, const memory_desc_ext &dst_desc, void *dst);
+  /// Computing a specified activation function value.
+  /// \param [in] desc Activation descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  void activation_forward(activation_desc &desc, float alpha,
+                                 const memory_desc_ext &src_desc, void *src,
+                                 float beta, const memory_desc_ext &dst_desc,
+                                 void *dst);
+  /// Computing the gradient of a specified activation function.
+  /// \param [in] desc Activation descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [in] dst Pointer to destination data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the differential destination memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential source data.
+  void
+  activation_backward(activation_desc &desc, float alpha,
+                      const memory_desc_ext &dst_desc, void *dst,
+                      const memory_desc_ext &diff_dst_desc, void *diff_dst,
+                      const memory_desc_ext &src_desc, void *src, float beta,
+                      const memory_desc_ext &diff_src_desc, void *diff_src);
+  /// Computing a specified pooling function value.
+  /// \param [in] desc Pooling descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \param [out] workspace Pointer to workspace generated from forward propagation.
+  void pooling_forward(pooling_desc &desc, float alpha,
+                              const memory_desc_ext &src_desc, void *src,
+                              float beta, const memory_desc_ext &dst_desc,
+                              void *dst, ::dnnl::memory *workspace = nullptr);
+  /// Computing the gradient of a specified pooling function.
+  /// \param [in] desc Activation descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [in] dst Pointer to destination data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the differential destination memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential
+  /// source data. 
+  /// \param [in] workspace Pointer to workspace used for backward
+  /// propagation.
+  void pooling_backward(pooling_desc &desc, float alpha,
+                               const memory_desc_ext &dst_desc, void *dst,
+                               const memory_desc_ext &diff_dst_desc,
+                               void *diff_dst, const memory_desc_ext &src_desc,
+                               void *src, float beta,
+                               const memory_desc_ext &diff_src_desc,
+                               void *diff_src,
+                               ::dnnl::memory *workspace = nullptr);
+  /// Computing a specified softmax function value.
+  /// \param [in] alg Softmax algorithm.
+  /// \param [in] mode Softmax mode.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value. 
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data. 
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  void softmax_forward(softmax_algorithm alg, softmax_mode mode,
+                              float alpha, const memory_desc_ext &src_desc,
+                              void *src, float beta,
+                              const memory_desc_ext &dst_desc, void *dst);
+  /// Computing the gradient of a specified softmax function.
+  /// \param [in] alg Softmax algorithm.
+  /// \param [in] mode Softmax mode.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value. 
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [in] dst Pointer to destination data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the differential destination memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential source data.
+  void softmax_backward(softmax_algorithm alg, softmax_mode mode,
+                               float alpha, const memory_desc_ext &dst_desc,
+                               void *dst, const memory_desc_ext &diff_dst_desc,
+                               void *diff_dst, float beta,
+                               const memory_desc_ext &diff_src_desc,
+                               void *diff_src);
+  /// Computing a specified local response normalization function value.
+  /// \param [in] desc Local response normalization descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \param [out] workspace Pointer to workspace generated from forward
+  /// propagation.
+  void lrn_forward(lrn_desc &desc, float alpha,
+                          const memory_desc_ext &src_desc, void *src,
+                          float beta, const memory_desc_ext &dst_desc,
+                          void *dst, ::dnnl::memory *workspace = nullptr);
+  /// Computing the gradient of a specified local response normalization
+  /// function.
+  /// \param [in] desc Local response normalization descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed value.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [in] dst Pointer to destination data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the differential destination memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential source data.
+  /// \param [in] workspace Pointer to workspace used for backward propagation.
+  void lrn_backward(lrn_desc &desc, float alpha,
+                           const memory_desc_ext &dst_desc, void *dst,
+                           const memory_desc_ext &diff_dst_desc, void *diff_dst,
+                           const memory_desc_ext &src_desc, void *src,
+                           float beta, const memory_desc_ext &diff_src_desc,
+                           void *diff_src, ::dnnl::memory *workspace = nullptr);
+  /// Setting all elements of a memory to a given value asynchronously.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] valuePtr Pointer to a single value.
+  /// \returns An event representing the fill operations.
+  sycl::event async_fill(const memory_desc_ext &src_desc, void *src,
+                   const void *valuePtr);
+  /// Coping the scaled data from a memory to another memory with a different
+  /// description asynchronously.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \returns An event representing the reorder operations.
+  sycl::event async_reorder(float alpha, const memory_desc_ext &src_desc, void *src,
+                      float beta, const memory_desc_ext &dst_desc, void *dst);
+  /// Scaling all the elements of a memory by a given factor asynchronously.
+  /// \param [in] alpha Value to scaling factors.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [out] src Pointer to source data.
+  /// \returns An event representing the scale operations.
+  sycl::event async_scale(float alpha, const memory_desc_ext &src_desc, void *src);
+  /// Adding the scaled values of a memory to another memory asynchronously.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \returns An event representing the sum operations.
+  sycl::event async_sum(float alpha, const memory_desc_ext &src_desc, void *src,
+                  float beta, const memory_desc_ext &dst_desc, void *dst);
+
+  /// Perform specified binary operation asynchronously.
+  /// \param [in] op Specified binary operation.
+  /// \param [in] alpha_0 Value to scaling factors used to scale the src_0
+  /// value.
+  /// \param [in] src_desc_0 Source 0 memory descriptor.
+  /// \param [in] src_0 Pointer to source 0 data.
+  /// \param [in] alpha_1 Value to scaling factors used to scale the src_1
+  /// value.
+  /// \param [in] src_desc_1 Source 1 memory descriptor.
+  /// \param [in] src_1 Pointer to source 1 data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \returns An event representing the binary operations.
+  sycl::event async_binary(binary_op op, float alpha_0,
+                     const memory_desc_ext &src_desc_0, void *src_0,
+                     float alpha_1, const memory_desc_ext &src_desc_1,
+                     void *src_1, float beta, const memory_desc_ext &dst_desc,
+                     void *dst);
+
+  /// Perform specified binary operation asynchronously.
+  /// \param [in] op Specified reduction operation.
+  /// \param [in] alpha Value to scaling factors used to scale the data
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \returns An event representing the reduction operations.
+  sycl::event async_reduction(reduction_op op, float alpha,
+                        const memory_desc_ext &src_desc, void *src, float beta,
+                        const memory_desc_ext &dst_desc, void *dst);
+  /// Computing a specified activation function value asynchronously.
+  /// \param [in] desc Activation descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \returns An event representing the activation forward operations.
+  sycl::event async_activation_forward(activation_desc &desc, float alpha,
+                                 const memory_desc_ext &src_desc, void *src,
+                                 float beta, const memory_desc_ext &dst_desc,
+                                 void *dst);
+  /// Computing the gradient of a specified activation function asynchronously.
+  /// \param [in] desc Activation descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [in] dst Pointer to destination data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the differential destination memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential source data.
+  /// \returns An event representing the activation backward operations.
+  sycl::event
+  async_activation_backward(activation_desc &desc, float alpha,
+                      const memory_desc_ext &dst_desc, void *dst,
+                      const memory_desc_ext &diff_dst_desc, void *diff_dst,
+                      const memory_desc_ext &src_desc, void *src, float beta,
+                      const memory_desc_ext &diff_src_desc, void *diff_src);
+  /// Computing a specified pooling function value asynchronously.
+  /// \param [in] desc Pooling descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \param [out] workspace Pointer to workspace generated from forward propagation.
+  /// \returns An event representing the pooling forward operations.
+  sycl::event async_pooling_forward(pooling_desc &desc, float alpha,
+                              const memory_desc_ext &src_desc, void *src,
+                              float beta, const memory_desc_ext &dst_desc,
+                              void *dst, ::dnnl::memory *workspace = nullptr);
+  /// Computing the gradient of a specified pooling function asynchronously.
+  /// \param [in] desc Activation descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [in] dst Pointer to destination data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the differential destination memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential
+  /// source data. 
+  /// \param [in] workspace Pointer to workspace used for backward
+  /// propagation.
+  /// \returns An event representing the pooling backward operations.
+  sycl::event async_pooling_backward(pooling_desc &desc, float alpha,
+                               const memory_desc_ext &dst_desc, void *dst,
+                               const memory_desc_ext &diff_dst_desc,
+                               void *diff_dst, const memory_desc_ext &src_desc,
+                               void *src, float beta,
+                               const memory_desc_ext &diff_src_desc,
+                               void *diff_src,
+                               ::dnnl::memory *workspace = nullptr);
+  /// Computing a specified softmax function value asynchronously.
+  /// \param [in] alg Softmax algorithm.
+  /// \param [in] mode Softmax mode.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value. 
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data. 
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \returns An event representing the softmax forward operations.
+  sycl::event async_softmax_forward(softmax_algorithm alg, softmax_mode mode,
+                              float alpha, const memory_desc_ext &src_desc,
+                              void *src, float beta,
+                              const memory_desc_ext &dst_desc, void *dst);
+  /// Computing the gradient of a specified softmax function asynchronously.
+  /// \param [in] alg Softmax algorithm.
+  /// \param [in] mode Softmax mode.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value. 
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [in] dst Pointer to destination data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the differential destination memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential source data.
+  /// \returns An event representing the softmax backward operations.
+  sycl::event async_softmax_backward(softmax_algorithm alg, softmax_mode mode,
+                               float alpha, const memory_desc_ext &dst_desc,
+                               void *dst, const memory_desc_ext &diff_dst_desc,
+                               void *diff_dst, float beta,
+                               const memory_desc_ext &diff_src_desc,
+                               void *diff_src);
+  /// Computing a specified local response normalization function value
+  /// asynchronously.
+  /// \param [in] desc Local response normalization descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \param [out] workspace Pointer to workspace generated from forward
+  /// propagation.
+  /// \returns An event representing the lrn forward operations.
+  sycl::event async_lrn_forward(lrn_desc &desc, float alpha,
+                          const memory_desc_ext &src_desc, void *src,
+                          float beta, const memory_desc_ext &dst_desc,
+                          void *dst, ::dnnl::memory *workspace = nullptr);
+  /// Computing the gradient of a specified local response normalization
+  /// function asynchronously.
+  /// \param [in] desc Local response normalization descriptor.
+  /// \param [in] alpha Value to scaling factors used to scale the computed value.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [in] dst Pointer to destination data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the differential destination memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential source data.
+  /// \param [in] workspace Pointer to workspace used for backward propagation.
+  /// \returns An event representing the lrn backward operations.
+  sycl::event async_lrn_backward(lrn_desc &desc, float alpha,
+                           const memory_desc_ext &dst_desc, void *dst,
+                           const memory_desc_ext &diff_dst_desc, void *diff_dst,
+                           const memory_desc_ext &src_desc, void *src,
+                           float beta, const memory_desc_ext &diff_src_desc,
+                           void *diff_src, ::dnnl::memory *workspace = nullptr);
+
+  /// Derives a memory descriptor for the batch normalization scale, bias, mean,
+  /// variance from the source memory descriptor and batch normalization mode.
+  /// \param [out] desc Derived memory descriptor.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] mode Batch normalization mode.
+  static void derive_batch_normalization_memory_desc(memory_desc_ext &desc,
+                                              const memory_desc_ext &src_desc,
+                                              batch_normalization_mode mode);
+
+  /// Derives a memory descriptor for the batch normalization scale, bias, mean,
+  /// variance from the source memory descriptor and batch normalization mode.
+  /// \param [out] scale_bias_desc Derived scale and bias memory descriptor.
+  /// \param [out] mean_var_desc Derived mean and var memory descriptor.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] mode Batch normalization mode.
+  static void derive_batch_normalization_memory_desc(memory_desc_ext &scale_bias_desc,
+                                             memory_desc_ext &mean_var_desc,
+                                             const memory_desc_ext &src_desc,
+                                             batch_normalization_mode mode);
+
+  /// Get the size of workspace that needed by batch normalization. The data stored
+  /// in workspace must be preserved between forward and backward.
+  /// \param [in] ops Batch normalization operation mode. This mode can set to
+  /// perform only batch normalization, or batch normalization followed by
+  /// activation, or batch normalization followed by element-wise addition and
+  /// activation.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \returns Size of workspace.
+  size_t get_batch_normalization_workspace_size(
+    batch_normalization_ops ops, const memory_desc_ext &src_desc);
+
+  /// Computing a specified batch normalization inference stage function value
+  /// asynchronously.
+  /// \param [in] mode Batch normalization mode.
+  /// \param [in] epsilon Epsilon value used in computation.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \param [in] scale_bias_mean_var_desc Scale, bias, mean, variance memory
+  /// descriptor.
+  /// \param [in] scale Pointer to scale data.
+  /// \param [in] bias Pointer to bias data.
+  /// \param [in] mean Pointer to mean data.
+  /// \param [in] var Pointer to variance data.
+  /// \returns An event representing the batch normalization forward operations.
+  sycl::event async_batch_normalization_forward_inference(
+      batch_normalization_mode mode, float epsilon, float alpha,
+      const memory_desc_ext &src_desc, void *src, float beta,
+      const memory_desc_ext &dst_desc, void *dst,
+      const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
+      void *mean, void *var);
+
+  /// Computing a specified batch normalization inference stage function value
+  /// asynchronously.
+  /// \param [in] mode Batch normalization mode.
+  /// \param [in] ops Batch normalization operation mode. This mode can set to
+  /// perform only batch normalization, or batch normalization followed by
+  /// activation, or batch normalization followed by element-wise addition and
+  /// activation.
+  /// \param [in] adesc Activation operation descriptor.
+  /// \param [in] epsilon Epsilon value used in computation.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \param [in] summand_desc Summand memory descriptor.
+  /// \param [in] summand Pointer to summand data.
+  /// \param [in] scale_bias_desc Scale, bias memory descriptor.
+  /// \param [in] scale Pointer to scale data.
+  /// \param [in] bias Pointer to bias data.
+  /// \param [in] mean_var_desc Mean, variance memory descriptor.
+  /// \param [in] mean Pointer to mean data.
+  /// \param [in] var Pointer to variance data.
+  /// \returns An event representing the batch normalization forward operations.
+  sycl::event async_batch_normalization_forward_inference(
+      batch_normalization_mode mode, batch_normalization_ops ops,
+      activation_desc &adesc, float epsilon, float alpha,
+      const memory_desc_ext &src_desc, void *src, float beta,
+      const memory_desc_ext &dst_desc, void *dst,
+      const memory_desc_ext &summand_desc, void *summand,
+      const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
+      const memory_desc_ext &mean_var_desc, void *mean, void *var);
+
+  /// Computing a specified batch normalization training stage function value
+  /// asynchronously.
+  /// \param [in] mode Batch normalization mode.
+  /// \param [in] epsilon Epsilon value used in computation.
+  /// \param [in] factor Factor value used in running mean and variance
+  /// computation.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \param [in] scale_bias_mean_var_desc Scale, bias, mean, variance memory
+  /// descriptor.
+  /// \param [in] scale Pointer to scale data.
+  /// \param [in] bias Pointer to bias data.
+  /// \param [out] running_mean Pointer to running mean data.
+  /// \param [out] running_var Pointer to running variance data.
+  /// \param [out] saved_mean Pointer to optional cache to save mean data.
+  /// \param [out] saved_var Pointer to optional cache to save variance data.
+  /// \returns An event representing the batch normalization forward operations.
+  sycl::event async_batch_normalization_forward_training(
+      batch_normalization_mode mode, float epsilon, float factor, float alpha,
+      const memory_desc_ext &src_desc, void *src, float beta,
+      const memory_desc_ext &dst_desc, void *dst,
+      const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
+      void *running_mean, void *running_var, void *saved_mean, void *saved_var);
+
+  /// Computing a specified batch normalization training stage function value
+  /// asynchronously.
+  /// \param [in] mode Batch normalization mode.
+  /// \param [in] ops Batch normalization operation mode. This mode can set to
+  /// perform only batch normalization, or batch normalization followed by
+  /// activation, or batch normalization followed by element-wise addition and
+  /// activation.
+  /// \param [in] adesc Activation operation descriptor.
+  /// \param [in] epsilon Epsilon value used in computation.
+  /// \param [in] factor Factor value used in running mean and variance
+  /// computation.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \param [in] summand_desc Summand memory descriptor.
+  /// \param [in] summand Pointer to summand data.
+  /// \param [in] scale_bias_mean_var_desc Scale, bias, mean, variance memory
+  /// descriptor.
+  /// \param [in] scale Pointer to scale data.
+  /// \param [in] bias Pointer to bias data.
+  /// \param [out] running_mean Pointer to running mean data.
+  /// \param [out] running_var Pointer to running variance data.
+  /// \param [out] saved_mean Pointer to optional cache to save mean data.
+  /// \param [out] saved_var Pointer to optional cache to save variance data.
+  /// \param [in] workspace_size Size of workspace.
+  /// \param [out] workspace Pointer to workspace generated from forward
+  /// propagation.
+  /// \returns An event representing the batch normalization forward operations.
+  sycl::event async_batch_normalization_forward_training(
+      batch_normalization_mode mode, batch_normalization_ops ops,
+      activation_desc &adesc, float epsilon, float factor, float alpha,
+      const memory_desc_ext &src_desc, void *src, float beta,
+      const memory_desc_ext &dst_desc, void *dst,
+      const memory_desc_ext &summand_desc, void *summand,
+      const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
+      void *running_mean, void *running_var, void *saved_mean, void *saved_var,
+      size_t workspace_size, void *workspace);
+
+  /// Computing a specified batch normalization training stage function value
+  /// asynchronously.
+  /// \param [in] mode Batch normalization mode.
+  /// \param [in] ops Batch normalization operation mode. This mode can set to
+  /// perform only batch normalization, or batch normalization followed by
+  /// activation, or batch normalization followed by element-wise addition and
+  /// activation.
+  /// \param [in] adesc Activation operation descriptor.
+  /// \param [in] epsilon Epsilon value used in computation.
+  /// \param [in] factor Factor value used in running mean and variance
+  /// computation.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \param [in] summand_desc Summand memory descriptor.
+  /// \param [in] summand Pointer to summand data.
+  /// \param [in] scale_bias_desc Scale, bias memory descriptor.
+  /// \param [in] scale Pointer to scale data.
+  /// \param [in] bias Pointer to bias data.
+  /// \param [in] mean_var_desc Mean, variance memory descriptor.
+  /// \param [out] running_mean Pointer to running mean data.
+  /// \param [out] running_var Pointer to running variance data.
+  /// \param [out] saved_mean Pointer to optional cache to save mean data.
+  /// \param [out] saved_var Pointer to optional cache to save variance data.
+  /// \param [in] workspace_size Size of workspace.
+  /// \param [out] workspace Pointer to workspace generated from forward
+  /// propagation.
+  /// \returns An event representing the batch normalization forward operations.
+  sycl::event async_batch_normalization_forward_training(
+      batch_normalization_mode mode, batch_normalization_ops ops,
+      activation_desc &adesc, float epsilon, float factor, float alpha,
+      const memory_desc_ext &src_desc, void *src, float beta,
+      const memory_desc_ext &dst_desc, void *dst,
+      const memory_desc_ext &summand_desc, void *summand,
+      const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
+      const memory_desc_ext &mean_var_desc, void *running_mean, void *running_var,
+      void *saved_mean, void *saved_var, size_t workspace_size, void *workspace);
+
+  /// Computing the gradient of a specified batch normalization function asynchronously.
+  /// \param [in] mode Batch normalization mode.
+  /// \param [in] epsilon Epsilon value used in computation.
+  /// \param [in] alpha_data Value to scaling factors used to scale the computed
+  /// data value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] beta_data Value to scaling factors used to scale the prior value
+  /// in the data memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential source data.
+  /// \param [in] alpha_param Value to scaling factors used to scale the computed
+  /// parameter value.
+  /// \param [in] diff_scale_bias_mean_var_desc Differential scale, bias, mean,
+  /// variance memory descriptor.
+  /// \param [in] scale Pointer to scale data.
+  /// \param [in] beta_param Value to scaling factors used to scale the prior value
+  /// in the parameter memory.
+  /// \param [in] diff_scale Pointer to differential scale data.
+  /// \param [in] diff_bias Pointer to differential bias data.
+  /// \param [in] saved_mean Pointer to optional cache saved mean data in forward.
+  /// \param [in] saved_var Pointer to optional cache saved variance data in forward.
+  /// \returns An event representing the batch normalization backward operations.
+  sycl::event async_batch_normalization_backward(
+      batch_normalization_mode mode, float epsilon, float alpha_data,
+      const memory_desc_ext &src_desc, void *src,
+      const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data,
+      const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param,
+      const memory_desc_ext &diff_scale_bias_mean_var_desc, void *scale,
+      float beta_param, void *diff_scale, void *diff_bias, void *saved_mean,
+      void *saved_var);
+
+  /// Computing the gradient of a specified batch normalization function
+  /// asynchronously.
+  /// \param [in] mode Batch normalization mode.
+  /// \param [in] ops Batch normalization operation mode. This mode can set to
+  /// perform only batch normalization, or batch normalization followed by
+  /// activation, or batch normalization followed by element-wise addition and
+  /// activation.
+  /// \param [in] adesc Activation operation descriptor.
+  /// \param [in] epsilon Epsilon value used in computation.
+  /// \param [in] alpha_data Value to scaling factors used to scale the computed
+  /// data value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [in] dst Pointer to destination data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] beta_data Value to scaling factors used to scale the prior value
+  /// in the data memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential source data.
+  /// \param [in] diff_summand_desc Differential summand memory descriptor.
+  /// \param [out] diff_summand Pointer to differential summand data.
+  /// \param [in] alpha_param Value to scaling factors used to scale the computed
+  /// parameter value.
+  /// \param [in] diff_scale_bias_mean_var_desc Differential scale, bias, mean,
+  /// variance memory descriptor.
+  /// \param [in] scale Pointer to scale data.
+  /// \param [in] bias Pointer to bias data.
+  /// \param [in] beta_param Value to scaling factors used to scale the prior value
+  /// in the parameter memory.
+  /// \param [out] diff_scale Pointer to differential scale data.
+  /// \param [out] diff_bias Pointer to differential bias data.
+  /// \param [in] saved_mean Pointer to optional cache saved mean data in forward.
+  /// \param [in] saved_var Pointer to optional cache saved variance data in forward.
+  /// \param [in] workspace_size Size of workspace.
+  /// \param [in] workspace Pointer to workspace used for backward propagation.
+  /// \returns An event representing the batch normalization backward operations.
+  sycl::event async_batch_normalization_backward(
+      batch_normalization_mode mode, batch_normalization_ops ops,
+      activation_desc &adesc, float epsilon, float alpha_data,
+      const memory_desc_ext &src_desc, void *src,
+      const memory_desc_ext &dst_desc, void *dst,
+      const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data,
+      const memory_desc_ext &diff_src_desc, void *diff_src,
+      const memory_desc_ext &diff_summand_desc, void *diff_summand,
+      float alpha_param, const memory_desc_ext &diff_scale_bias_mean_var_desc,
+      void *scale, void *bias, float beta_param, void *diff_scale,
+      void *diff_bias, void *saved_mean, void *saved_var,
+      size_t workspace_size, void *workspace);
+
+  /// Computing the gradient of a specified batch normalization function
+  /// asynchronously.
+  /// \param [in] mode Batch normalization mode.
+  /// \param [in] ops Batch normalization operation mode. This mode can set to
+  /// perform only batch normalization, or batch normalization followed by
+  /// activation, or batch normalization followed by element-wise addition and
+  /// activation.
+  /// \param [in] adesc Activation operation descriptor.
+  /// \param [in] epsilon Epsilon value used in computation.
+  /// \param [in] alpha_data Value to scaling factors used to scale the computed
+  /// data value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [in] dst Pointer to destination data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] beta_data Value to scaling factors used to scale the prior value
+  /// in the data memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential source data.
+  /// \param [in] diff_summand_desc Differential summand memory descriptor.
+  /// \param [out] diff_summand Pointer to differential summand data.
+  /// \param [in] alpha_param Value to scaling factors used to scale the computed
+  /// parameter value.
+  /// \param [in] diff_scale_bias_desc Differential scale, bias memory descriptor.
+  /// \param [in] scale Pointer to scale data.
+  /// \param [in] bias Pointer to bias data.
+  /// \param [in] beta_param Value to scaling factors used to scale the prior value
+  /// in the parameter memory.
+  /// \param [out] diff_scale Pointer to differential scale data.
+  /// \param [out] diff_bias Pointer to differential bias data.
+  /// \param [in] mean_var_desc Differential mean, variance memory descriptor.
+  /// \param [in] saved_mean Pointer to optional cache saved mean data in forward.
+  /// \param [in] saved_var Pointer to optional cache saved variance data in forward.
+  /// \param [in] workspace_size Size of workspace.
+  /// \param [in] workspace Pointer to workspace used for backward propagation.
+  /// \returns An event representing the batch normalization backward operations.
+  sycl::event async_batch_normalization_backward(
+      batch_normalization_mode mode, batch_normalization_ops ops,
+      activation_desc &adesc, float epsilon, float alpha_data,
+      const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc,
+      void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst,
+      float beta_data, const memory_desc_ext &diff_src_desc, void *diff_src,
+      const memory_desc_ext &diff_summand_desc, void *diff_summand,
+      float alpha_param, const memory_desc_ext &diff_scale_bias_desc, void *scale,
+      void *bias, float beta_param, void *diff_scale, void *diff_bias,
+      const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var,
+      size_t workspace_size, void *workspace);
+
+  /// Computing a specified convolution function value asynchronously.
+  /// \param [in] desc Convolution descriptor.
+  /// \param [in] alg Convolution algorithm.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] weight_desc Weight memory descriptor.
+  /// \param [in] weight Pointer to weight data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \returns An event representing the convolution forward operations.
+  sycl::event async_convolution_forward(convolution_desc &desc, ::dnnl::algorithm alg,
+                                  float alpha, const memory_desc_ext &src_desc,
+                                  void *src, const memory_desc_ext &weight_desc,
+                                  void *weight, float beta,
+                                  const memory_desc_ext &dst_desc, void *dst);
+
+  /// Computing a specified convolution function value asynchronously.
+  /// \param [in] desc Convolution descriptor.
+  /// \param [in] alg Convolution algorithm.
+  /// \param [in] adesc Activation operation descriptor.
+  /// \param [in] alpha_0 Value to scaling factors used to scale the data
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] weight_desc Weight memory descriptor.
+  /// \param [in] weight Pointer to weight data.
+  /// \param [in] alpha_1 Value to scaling factors used to scale the summand
+  /// value.
+  /// \param [in] summand_desc Summand memory descriptor.
+  /// \param [in] summand Pointer to summand data.
+  /// \param [in] bias_desc Bias memory descriptor.
+  /// \param [in] bias Pointer to bias data.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \returns An event representing the convolution forward operations.
+  sycl::event async_convolution_forward(
+      convolution_desc &desc, ::dnnl::algorithm alg, activation_desc &adesc,
+      float alpha_0, const memory_desc_ext &src_desc, void *src,
+      const memory_desc_ext &weight_desc, void *weight, float alpha_1,
+      const memory_desc_ext &summand_desc, void *summand,
+      const memory_desc_ext &bias_desc, void *bias,
+      const memory_desc_ext &dst_desc, void *dst);
+
+  /// Computing the data gradient of a specified convolution function asynchronously.
+  /// \param [in] desc Convolution descriptor.
+  /// \param [in] alg Convolution algorithm.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] weight_desc Weight memory descriptor.
+  /// \param [in] weight Pointer to weight data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential source data.
+  /// \returns An event representing the convolution backward data operations.
+  sycl::event async_convolution_backward_data(
+      convolution_desc &desc, ::dnnl::algorithm alg, float alpha,
+      const memory_desc_ext &weight_desc, void *weight,
+      const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta,
+      const memory_desc_ext &diff_src_desc, void *diff_src);
+
+  /// Computing the weight gradient of a specified convolution function
+  /// asynchronously.
+  /// \param [in] desc Convolution descriptor.
+  /// \param [in] alg Convolution algorithm.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] diff_weight_desc Differential weight memory descriptor.
+  /// \param [out] diff_weight Pointer to differential weight data.
+  /// \returns An event representing the convolution backward weight operations.
+  sycl::event async_convolution_backward_weight(
+      convolution_desc &desc, ::dnnl::algorithm alg, float alpha,
+      const memory_desc_ext &src_desc, void *src,
+      const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta,
+      const memory_desc_ext &diff_weight_desc, void *diff_weight);
+
+  /// Computing the bias gradient of a specified convolution function
+  /// asynchronously.
+  /// \param [in] alpha Value to scaling factors used to scale the computed
+  /// value.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] beta Value to scaling factors used to scale the prior value
+  /// in the destination memory.
+  /// \param [in] diff_bias_desc Differential bias memory descriptor.
+  /// \param [out] diff_bias Pointer to differential bias data.
+  /// \returns An event representing the convolution backward bias operations.
+  sycl::event async_convolution_backward_bias(float alpha,
+                                        const memory_desc_ext &diff_dst_desc,
+                                        void *diff_dst, float beta,
+                                        const memory_desc_ext &diff_bias_desc,
+                                        void *diff_bias);
+
+  /// Getting the required weight space size for specified rnn operation.  
+  /// \param [in] desc RNN descriptor.
+  /// \param [out] weight_space_size Size of required weight space.
+  void rnn_get_weight_space_size(const rnn_desc &desc,
+                                 size_t *weight_space_size);
+
+  /// Getting the required scratchpad size and workspace size for specified rnn operation.  
+  /// \param [in] desc RNN descriptor.
+  /// \param [in] kind Propagation kind.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [out] scratchpad_size Size of required scratchpad.
+  /// \param [out] workspace_size Size of required workspace.
+  void rnn_get_scratchpad_workspace_size(const rnn_desc &desc, ::dnnl::prop_kind kind,
+                              const memory_desc_ext &src_desc,
+                              size_t *scratchpad_size, size_t *workspace_size);
+
+  /// Computing a specified rnn function value asynchronously.
+  /// \param [in] desc RNN descriptor.
+  /// \param [in] kind Propagation kind.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \param [in] iter_desc Recurrent hidden state data memory descriptor.
+  /// \param [in] src_iter Pointer to input recurrent hidden state data.
+  /// \param [in] dst_iter Pointer to output recurrent hidden state data.
+  /// \param [in] iter_c_desc Recurrent cell state data memory descriptor.
+  /// \param [in] src_c_iter Pointer to input recurrent cell state data.
+  /// \param [in] dst_c_iter Pointer to output recurrent cell state data.
+  /// \param [in] weight_size Size of weight memory.
+  /// \param [in] weight Pointer to weight data.
+  /// \param [in] scratchpad_size Size of scratchpad memory.
+  /// \param [in] scratchpad Pointer to scratchpad data.
+  /// \param [in] workspace_size Size of workspace memory.
+  /// \param [in] workspace Pointer to workspace data.
+  /// \returns An event representing the status of rnn forward operations.
+  sycl::event async_rnn_forward(const rnn_desc &desc, ::dnnl::prop_kind kind,
+                               const memory_desc_ext &src_desc, void *src,
+                               const memory_desc_ext &dst_desc, void *dst,
+                               const memory_desc_ext &iter_desc, void *src_iter,
+                               void *dst_iter,
+                               const memory_desc_ext &iter_c_desc,
+                               void *src_iter_c, void *dst_iter_c,
+                               size_t weight_size, void *weight,
+                               size_t scratchpad_size, void *scratchpad,
+                               size_t workspace_size, void *workspace);
+
+  /// Computing the data and weight gradient of a specified rnn function
+  /// asynchronously.
+  /// \param [in] desc RNN descriptor.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [in] dst Pointer to destination data.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [out] diff_src Pointer to differential source data.
+  /// \param [in] iter_desc Recurrent hidden state data memory descriptor.
+  /// \param [in] src_iter Pointer to input recurrent hidden state data.
+  /// \param [in] diff_dst_iter Pointer to differential output recurrent hidden state data.
+  /// \param [out] diff_src_iter Pointer to differential input recurrent hidden state data.
+  /// \param [in] iter_c_desc Recurrent cell state data memory descriptor.
+  /// \param [in] src_c_iter Pointer to input recurrent cell state data.
+  /// \param [in] diff_dst_c_iter Pointer to differential output recurrent cell state data.
+  /// \param [out] diff_src_c_iter Pointer to differential input recurrent cell state data.
+  /// \param [in] weight_size Size of weight memory.
+  /// \param [in] weight Pointer to weight data.
+  /// \param [out] diff_weight Pointer to differential weight data.
+  /// \param [in] scratchpad_size Size of scratchpad memory.
+  /// \param [in] scratchpad Pointer to scratchpad data.
+  /// \param [in] workspace_size Size of workspace memory.
+  /// \param [in] workspace Pointer to workspace data.
+  /// \returns An event representing the status of rnn backward operations.
+  sycl::event async_rnn_backward(
+      const rnn_desc &desc, const memory_desc_ext &dst_desc, void *dst,
+      void *diff_dst, const memory_desc_ext &src_desc, void *src,
+      void *diff_src, const memory_desc_ext &iter_desc, void *src_iter,
+      void *diff_dst_iter, void *diff_src_iter,
+      const memory_desc_ext &iter_c_desc, void *src_iter_c,
+      void *diff_dst_iter_c, void *diff_src_iter_c, size_t weight_size,
+      void *weight, void *diff_weight, size_t scratchpad_size, void *scratchpad,
+      size_t workspace_size, void *workspace);
+
+  /// Getting the required state size for specified dropout operation.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \returns Required size of state.
+  size_t get_dropout_state_size();
+
+  /// Getting the required workspace size for dropout operation.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \returns Required size of workspace.
+  static size_t get_dropout_workspace_size(const memory_desc_ext &src_desc);
+
+  /// Computing a specified dropout function value asynchronously.
+  /// \param [in] desc Dropout descriptor.
+  /// \param [in] src_desc Source memory descriptor.
+  /// \param [in] src Pointer to source data.
+  /// \param [in] dst_desc Destination memory descriptor.
+  /// \param [out] dst Pointer to destination data.
+  /// \param [in] workspace Pointer to workspace data.
+  /// \param [in] workspace_size Size of workspace memory.
+  /// \returns An event representing the dropout forward operations.
+  sycl::event async_dropout_forward(dropout_desc &desc,
+                                    const memory_desc_ext &src_desc, void *src,
+                                    const memory_desc_ext &dst_desc, void *dst,
+                                    void *workspace, size_t workspace_size);
+
+  /// Computing the gradient of a specified dropout function asynchronously.
+  /// \param [in] desc Dropout descriptor.
+  /// \param [in] diff_dst_desc Differential destination memory descriptor.
+  /// \param [in] diff_dst Pointer to differential destination data.
+  /// \param [in] diff_src_desc Differential source memory descriptor.
+  /// \param [out] diff_src Pointer to differential source data.
+  /// \param [in] workspace Pointer to workspace data.
+  /// \param [in] workspace_size Size of workspace memory.
+  /// \returns An event representing the dropout backward operations.
+  sycl::event async_dropout_backward(dropout_desc &desc,
+                                     const memory_desc_ext &diff_dst_desc,
+                                     void *diff_dst,
+                                     const memory_desc_ext &diff_src_desc,
+                                     void *diff_src, void *workspace,
+                                     size_t workspace_size);
+};
+
+inline thread_local unsigned int engine_ext::_engine_count;
+inline thread_local detail::primitive_cache engine_ext::_primitive_cache;
+inline thread_local std::map<void *, ::dnnl::memory> engine_ext::_workspace_map;
+inline thread_local std::map<sycl::queue *,
+                             std::shared_ptr<engine_ext::internal_resource>>
+    engine_ext::_internal_resource_cache;
+
+inline
+void dropout_desc::restore(engine_ext &engine, float p, void *state,
+                                  size_t state_size, unsigned long long seed) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
+                             "Interfaces Project does not support this API.");
+#else
+  if (state) {
+    std::int64_t required_state_size = engine.get_dropout_state_size();
+    if (state_size < required_state_size) {
+      throw std::runtime_error("restore: state_size less than required state size.");
+    }
+    sycl::queue *q = engine.get_queue();
+    _imp->_p = p;
+    _imp->_seed = seed;
+    _imp->_state = state;
+    _imp->_host_state = std::vector<std::uint8_t>(required_state_size);
+    q->memcpy(_imp->_host_state.data(), _imp->_state, required_state_size).wait();
+    _imp->_rng_engine =
+        oneapi::mkl::rng::load_state<rng_engine_t>(
+            *q, _imp->_host_state.data());
+  }
+#endif
+}
+
+inline
+void dropout_desc::set(engine_ext &engine, float p, void *state,
+                              size_t state_size, unsigned long long seed) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
+                             "Interfaces Project does not support this API.");
+#else
+  _imp->_p = p;
+  if (state) {
+    std::int64_t required_state_size = engine.get_dropout_state_size();
+    if (state_size < required_state_size) {
+      throw std::runtime_error("set: no sufficient memory to save states.");
+    }
+    sycl::queue *q = engine.get_queue();
+    _imp->_seed = seed;
+    _imp->_state = state;
+    _imp->_host_state = std::vector<std::uint8_t>(required_state_size);
+    _imp->_rng_engine = rng_engine_t(*q, seed);
+    oneapi::mkl::rng::save_state(_imp->_rng_engine, _imp->_host_state.data());
+    q->memcpy(_imp->_state, _imp->_host_state.data(), required_state_size).wait();
+  }
+#endif
+}
+
+inline
+::dnnl::memory::data_type
+memory_desc_ext::to_dnnl_data_type(dpct::library_data_t dt) {
+  using dnnl_dt = ::dnnl::memory::data_type;
+  switch (dt) {
+  case dpct::library_data_t::real_half:
+    return dnnl_dt::f16;
+  case dpct::library_data_t::real_bfloat16:
+    return dnnl_dt::bf16;
+  case dpct::library_data_t::real_float:
+    return dnnl_dt::f32;
+  case dpct::library_data_t::real_double:
+    return dnnl_dt::f64;
+  case dpct::library_data_t::real_int32:
+    return dnnl_dt::s32;
+  case dpct::library_data_t::real_int8:
+    return dnnl_dt::s8;
+  case dpct::library_data_t::real_uint8:
+    return dnnl_dt::u8;
+  case dpct::library_data_t::real_int8_4:
+    return dnnl_dt::s8;
+  case dpct::library_data_t::real_int8_32:
+    return dnnl_dt::s8;
+  case dpct::library_data_t::real_uint8_4:
+    return dnnl_dt::u8;
+  default:
+    throw std::runtime_error("to_dnnl_data_type: unsupported data type.");
+  }
+}
+
+inline
+dpct::library_data_t
+memory_desc_ext::to_dpct_library_data_t(::dnnl::memory::data_type dt,
+                                        unsigned block_size) {
+  using dpct_dt = dpct::library_data_t;
+  using dnnl_dt = ::dnnl::memory::data_type;
+  switch (dt) {
+  case dnnl_dt::f16:
+    return dpct_dt::real_half;
+  case dnnl_dt::bf16:
+    return dpct_dt::real_bfloat16;
+  case dnnl_dt::f32:
+    return dpct_dt::real_float;
+  case dnnl_dt::f64:
+    return dpct_dt::real_double;
+  case dnnl_dt::s32:
+    return dpct_dt::real_int32;
+  case dnnl_dt::s8:
+    if (block_size == 4) {
+      return dpct_dt::real_int8_4;
+    } else if (block_size == 32) {
+      return dpct_dt::real_int8_32;
+    } else {
+      return dpct_dt::real_int8;
+    }
+  case dnnl_dt::u8:
+    if (block_size == 4) {
+      return dpct_dt::real_uint8_4;
+    } else {
+      return dpct_dt::real_uint8;
+    }
+  default:
+    throw std::runtime_error("to_dpct_library_data_t: unsupported data type "
+                             "dnnl::memory::data_type::undef.");
+  }
+}
+
+inline
+::dnnl::memory::format_tag
+memory_desc_ext::to_dnnl_format_tag(dpct::library_data_t dt,
+                                    memory_format_tag tag) {
+  using dpct_dt = dpct::library_data_t;
+  using dpct_tag = memory_format_tag;
+  using dnnl_tag = ::dnnl::memory::format_tag;
+  switch (tag) {
+  case dpct_tag::nchw:
+    return dnnl_tag::nchw;
+  case dpct_tag::nhwc:
+    return dnnl_tag::nhwc;
+  default:
+    if (dt == dpct_dt::real_int8_32) {
+      return dnnl_tag::nChw32c;
+    } else {
+      return dnnl_tag::nChw4c;
+    }
+  }
+}
+
+inline
+void memory_desc_ext::set(memory_format_tag tag, dpct::library_data_t dt, int n,
+                          int c, int h, int w) {
+  _desc = ::dnnl::memory::desc({n, c, h, w}, to_dnnl_data_type(dt),
+                               to_dnnl_format_tag(dt, tag));
+}
+
+inline
+void memory_desc_ext::set(dpct::library_data_t dt, int n, int c, int h, int w,
+                          int n_stride, int c_stride, int h_stride,
+                          int w_stride) {
+  _desc = ::dnnl::memory::desc({n, c, h, w}, to_dnnl_data_type(dt),
+                               {n_stride, c_stride, h_stride, w_stride});
+}
+
+inline
+void memory_desc_ext::set(dpct::library_data_t dt, int ndims, const int dims[],
+                          const int strides[]) {
+  _desc = ::dnnl::memory::desc({dims, dims + ndims}, to_dnnl_data_type(dt),
+                               {strides, strides + ndims});
+}
+
+inline
+void memory_desc_ext::set(memory_format_tag tag, dpct::library_data_t dt,
+                          int ndims, const int dims[]) {
+  _desc = ::dnnl::memory::desc({dims, dims + ndims}, to_dnnl_data_type(dt),
+                               to_dnnl_format_tag(dt, tag));
+}
+
+inline
+void memory_desc_ext::set(rnn_memory_format_tag tag, dpct::library_data_t dt,
+                          int t, int n, int c) {
+  if (tag == rnn_memory_format_tag::tnc) {
+    _desc = ::dnnl::memory::desc({t, n, c}, to_dnnl_data_type(dt),
+                                 ::dnnl::memory::format_tag::tnc);
+  } else if(tag == rnn_memory_format_tag::ntc) {
+    _desc = ::dnnl::memory::desc({t, n, c}, to_dnnl_data_type(dt),
+                                 ::dnnl::memory::format_tag::ntc);
+  } else {
+    throw std::runtime_error("set: unsupported memory format tag.");
+  }
+}
+
+inline
+void memory_desc_ext::get(dpct::library_data_t *dt, int *n, int *c, int *h,
+                          int *w, int *n_stride, int *c_stride, int *h_stride,
+                          int *w_stride) const {
+  unsigned block_size = 1;
+  auto dims = _desc.get_dims();
+  auto inner_blks = _desc.get_inner_blks();
+  auto strides = _desc.get_strides();
+  if (!inner_blks.empty()) {
+    block_size = inner_blks[0];
+  }
+
+  *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size);
+  *n = dims[0];
+  *c = dims[1];
+  *h = dims[2];
+  *w = dims[3];
+  *n_stride = strides[0] / block_size;
+  *c_stride = strides[1] / block_size;
+  *h_stride = strides[2] / block_size;
+  *w_stride = strides[3] / block_size;
+}
+
+inline
+void memory_desc_ext::get(dpct::library_data_t *dt, memory_format_tag *tag,
+                          int *n, int *c, int *h, int *w) const {
+  unsigned block_size = 1;
+  *tag = memory_format_tag::nchw;
+  auto dims = _desc.get_dims();
+  auto strides = _desc.get_strides();
+  auto inner_blks = _desc.get_inner_blks();
+  if (!inner_blks.empty()) {
+    block_size = inner_blks[0];
+    *tag = memory_format_tag::nchw_blocked;
+  }
+  if (strides[1] == 1 && dims[1] != 1) {
+    *tag = memory_format_tag::nhwc;
+  }
+  *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size);
+  *n = dims[0];
+  *c = dims[1];
+  *h = dims[2];
+  *w = dims[3];
+}
+
+inline
+void memory_desc_ext::get(dpct::library_data_t *dt, rnn_memory_format_tag *tag,
+                          int *t, int *n, int *c) const {
+  auto dims = _desc.get_dims();
+  auto strides = _desc.get_strides();
+
+  if (strides[0] >= strides[1]) {
+    *tag = rnn_memory_format_tag::tnc;
+  } else {
+    *tag = rnn_memory_format_tag::ntc;
+  }
+
+  *dt = to_dpct_library_data_t(_desc.get_data_type(), 1);
+  *t = dims[0];
+  *n = dims[1];
+  *c = dims[2];
+}
+
+inline
+void memory_desc_ext::get(int requested_ndims, dpct::library_data_t *dt,
+                          int *ndims, int dims[], int strides[]) const {
+  unsigned block_size = 1;
+  auto inner_blks = _desc.get_inner_blks();
+  auto adims = _desc.get_dims();
+  auto astrides = _desc.get_strides();
+  if (!inner_blks.empty()) {
+    block_size = inner_blks[0];
+  }
+  *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size);
+  *ndims = _desc.get_ndims();
+  for (int index = 0; index < requested_ndims; index++) {
+    dims[index] = adims[index];
+    strides[index] =
+        astrides[index] / block_size;
+  }
+}
+
+inline
+void memory_desc_ext::get(int requested_ndims, dpct::library_data_t *dt,
+                          memory_format_tag *tag, int *ndims,
+                          int dims[]) const {
+  unsigned block_size = 1;
+  *tag = memory_format_tag::nchw;
+  auto inner_blks = _desc.get_inner_blks();
+  auto adims = _desc.get_dims();
+  auto astrides = _desc.get_strides();
+  if (!inner_blks.empty()) {
+    block_size = inner_blks[0];
+    *tag = memory_format_tag::nchw_blocked;
+  }
+  if (astrides[1] == 1 &&
+      adims[1] != 1) {
+    *tag = memory_format_tag::nhwc;
+  }
+  *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size);
+  *ndims = _desc.get_ndims();
+  for (int index = 0; index < requested_ndims; index++) {
+    dims[index] = adims[index];
+  }
+}
+
+inline
+void engine_ext::get_rnn_configuration(const ::dnnl::memory::desc &desc,
+                                       rnn_direction direction, rnn_mode mode,
+                                       dpct::library_data_t dt, int hidden_size,
+                                       ::dnnl::memory::data_type *dnnl_dt,
+                                       ::dnnl::memory::format_tag *tag,
+                                       int *projection_size, int *output_size,
+                                       int *seq_length, int *batch_size,
+                                       int *direction_num, int *gate_num) {
+  if (!desc.is_zero()) {
+    auto dims = desc.get_dims();
+    auto strides = desc.get_strides();
+    if (strides[0] >= strides[1]) {
+      *tag = ::dnnl::memory::format_tag::tnc;
+      *seq_length = dims[0];
+      *batch_size = dims[1];
+    } else {
+      *tag = ::dnnl::memory::format_tag::ntc;
+      *seq_length = dims[1];
+      *batch_size = dims[0];
+    }
+  }
+  if (direction == rnn_direction::bidirectional) {
+    *direction_num = 2;
+  } else {
+    *direction_num = 1;
+  }
+  if (mode == rnn_mode::lstm) {
+    *gate_num = 4;
+  } else if (mode == rnn_mode::gru) {
+    *gate_num = 3;
+  } else {
+    *gate_num = 1;
+  }
+  if (*projection_size != hidden_size) {
+    *output_size = *projection_size;
+  } else {
+    *projection_size = 0;
+    *output_size = hidden_size;
+  }
+  *dnnl_dt = memory_desc_ext::to_dnnl_data_type(dt);
+}
+
+inline
+void *engine_ext::allocate(const memory_desc_ext &data_desc, int count) {
+  return allocate(data_desc.get_size() * count);
+}
+
+inline
+void *engine_ext::allocate(size_t size) {
+  auto &Info = get_internal_resource(_q)->binfo;
+  uint8_t *result = Info.buffer + Info.usage;
+  Info.usage += size;
+  return result;
+}
+
+inline
+void engine_ext::transform_no_zero(const memory_desc_ext &desc, void *src, void *dst) {
+  ::dnnl::memory::data_type dt = desc.get_desc().get_data_type();
+  size_t element_num = desc.get_element_num();
+  switch (dt) {
+  case ::dnnl::memory::data_type::f32:
+    transform_no_zero_with_type<float>(_q, src, dst, element_num);
+    break;
+  case ::dnnl::memory::data_type::f16:
+    transform_no_zero_with_type<sycl::half>(_q, src, dst, element_num);
+    break;
+  case ::dnnl::memory::data_type::s32:
+    transform_no_zero_with_type<int32_t>(_q, src, dst, element_num);
+    break;
+  case ::dnnl::memory::data_type::s8:
+    transform_no_zero_with_type<int8_t>(_q, src, dst, element_num);
+    break;
+  case ::dnnl::memory::data_type::u8:
+    transform_no_zero_with_type<uint8_t>(_q, src, dst, element_num);
+    break;
+  default:
+    throw std::runtime_error("transform_no_zero: unsupported data type.");
+  }
+}
+
+inline
+::dnnl::memory::desc
+engine_ext::get_group_weight_desc(int group_count,
+                                  const memory_desc_ext &weight_desc) {
+  if (group_count == 1) {
+    return weight_desc.get_desc();
+  }
+  auto help_weight_desc = weight_desc.get_desc();
+  int ndims = help_weight_desc.get_ndims();
+  if (!help_weight_desc.get_inner_blks().empty()) {
+    throw std::runtime_error("get_group_weight_desc: group convolution with "
+                             "blocked weight memory unimplemented.");
+  }
+  std::vector<int64_t> new_size;
+  auto old_size = weight_desc.get_dims();
+  new_size.push_back(group_count);
+  new_size.push_back(old_size[0] / group_count);
+  for (int index = 1; index < old_size.size(); index++) {
+    new_size.push_back(old_size[index]);
+  }
+  std::vector<int64_t> strides = help_weight_desc.get_strides();
+  ::dnnl::memory::format_tag tag;
+  bool is_nhwc = (strides[1] == 1 && old_size[1] != 1);
+
+  if (ndims == 4) {
+    if (is_nhwc) {
+      tag = ::dnnl::memory::format_tag::gohwi;
+    } else {
+      tag = ::dnnl::memory::format_tag::goihw;
+    }
+  } else if (ndims == 5) {
+    if (is_nhwc) {
+      tag = ::dnnl::memory::format_tag::godhwi;
+    } else {
+      tag = ::dnnl::memory::format_tag::goidhw;
+    }
+  }
+
+  help_weight_desc =
+      ::dnnl::memory::desc(new_size, weight_desc.get_desc().get_data_type(), tag);
+  return help_weight_desc;
+}
+
+inline
+::dnnl::memory::desc engine_ext::compress_spatial_dimensions_to_channel(
+    const ::dnnl::memory::desc &desc) {
+  int ndims = desc.get_ndims();
+  auto dims = desc.get_dims();
+  auto inner_blks = desc.get_inner_blks();
+  assert(ndims >= 4 && "ndims is at least 4.");
+  std::vector<int64_t> compressed_dims(ndims);
+  compressed_dims[0] = dims[0];
+  compressed_dims[1] = dims[1];
+  for (int index = 2; index < ndims; index++) {
+    compressed_dims[1] = compressed_dims[1] * dims[index];
+    compressed_dims[index] = 1;
+  }
+  if (!inner_blks.empty() && inner_blks[0] == 4) {
+    return ::dnnl::memory::desc(compressed_dims, desc.get_data_type(),
+                                ::dnnl::memory::format_tag::nChw4c);
+  } else if (!inner_blks.empty() && inner_blks[0] == 32) {
+    return ::dnnl::memory::desc(compressed_dims, desc.get_data_type(),
+                                ::dnnl::memory::format_tag::nChw32c);
+  }
+  std::vector<int64_t> strides(ndims, 1);
+  strides[0] = compressed_dims[1];
+
+  return ::dnnl::memory::desc(compressed_dims, desc.get_data_type(), strides);
+}
+
+inline
+::dnnl::memory::desc
+engine_ext::get_bn_scale_bias_mean_var_desc(const ::dnnl::memory::desc &desc,
+                                            batch_normalization_mode mode) {
+  int ndims = desc.get_ndims();
+  auto dims = desc.get_dims();
+  assert(ndims >= 4 && "ndims is at least 4.");
+  int channel_num = 1;
+  if (mode == batch_normalization_mode::spatial) {
+    channel_num = dims[1];
+  } else {
+    for (int index = 1; index < ndims; index++) {
+      channel_num = channel_num * dims[index];
+    }
+  }
+  return ::dnnl::memory::desc({channel_num}, desc.get_data_type(),
+                              ::dnnl::memory::format_tag::a);
+}
+
+inline
+::dnnl::memory::desc engine_ext::transfer_memory_desc_to_channel_major_format(
+    const ::dnnl::memory::desc &desc) {
+  if (!desc.get_inner_blks().empty()) {
+    return desc;
+  }
+  int ndims = desc.get_ndims();
+  auto dims = desc.get_dims();
+  if (ndims == 4) {
+    return ::dnnl::memory::desc(dims, desc.get_data_type(),
+                                ::dnnl::memory::format_tag::nchw);
+  }
+  return ::dnnl::memory::desc(dims, desc.get_data_type(),
+                              ::dnnl::memory::format_tag::ncdhw);
+}
+
+/// If the alpha = 0 and beta = 1, then the destination (dst = alpha * out +
+/// beta * prior_dst) have no change. In this case this function returns true
+/// means the operation can exit directly.
+inline
+bool engine_ext::scale_parameter_preprocess(
+    const std::vector<output_argument_info> &args) {
+  bool direct_exit = true;
+  for (auto &arg : args) {
+    if (arg._alpha == 0.f) {
+      if (arg._beta != 1.f) {
+        async_scale(arg._beta, arg._desc, arg._data);
+      }
+    } else {
+      direct_exit = false;
+    }
+  }
+  return direct_exit;
+}
+
+inline
+void engine_ext::derive_batch_normalization_memory_desc(
+    memory_desc_ext &scale_bias_desc, memory_desc_ext &mean_var_desc,
+    const memory_desc_ext &src_desc, batch_normalization_mode mode) {
+    derive_batch_normalization_memory_desc(scale_bias_desc, src_desc, mode);
+    derive_batch_normalization_memory_desc(mean_var_desc, src_desc, mode);
+}
+
+inline
+void engine_ext::derive_batch_normalization_memory_desc(
+    memory_desc_ext &desc, const memory_desc_ext &src_desc,
+    batch_normalization_mode mode) {
+  int src_ndims = src_desc.get_desc().get_ndims();
+  auto inner_blks = src_desc.get_desc().get_inner_blks();
+  if (src_desc.get_desc().get_ndims() != 4 ||
+      src_desc.get_desc().get_ndims() != 5) {
+    throw std::runtime_error("derive_batch_normalization_memory_desc: only 4d "
+                             "and 5d memory descriptor supported.");
+  }
+  std::vector<int64_t> dims = src_desc.get_dims();
+  dims[0] = 1;
+  if (mode == batch_normalization_mode::spatial) {
+    dims[2] = 1;
+    dims[3] = 1;
+    if (src_ndims == 5) {
+      dims[4] = 1;
+    }
+  }
+  auto data_type = src_desc.get_desc().get_data_type();
+  if (data_type == ::dnnl::memory::data_type::f16) {
+    data_type = ::dnnl::memory::data_type::f32;
+  }
+  if (!inner_blks.empty() && inner_blks[0] == 4) {
+    desc.set_desc(::dnnl::memory::desc(dims, data_type,
+                                       ::dnnl::memory::format_tag::nChw4c));
+  } else if (!inner_blks.empty() && inner_blks[0] == 32) {
+    desc.set_desc(::dnnl::memory::desc(dims, data_type,
+                                       ::dnnl::memory::format_tag::nChw32c));
+  } else {
+    if (src_ndims == 4) {
+      desc.set_desc(::dnnl::memory::desc(dims, data_type,
+                                         ::dnnl::memory::format_tag::nchw));
+    } else {
+      desc.set_desc(::dnnl::memory::desc(dims, data_type,
+                                         ::dnnl::memory::format_tag::ncdhw));
+    }
+  }
+}
+
+template <typename primitive_type>
+sycl::event engine_ext::execute_primitive(
+    const std::pair<detail::primitive_cache_key_type, detail::primitive_and_args>
+        &primitive,
+    const std::vector<output_argument_info> &output_args) {
+  std::vector<void *> caches;
+  int output_arg_num = output_args.size();
+  for (int i = 0; i < output_arg_num; i++) {
+    if (output_args[i]._beta != 0.f) {
+      auto cache = allocate(output_args[i]._desc);
+      caches.push_back(cache);
+      (*primitive.second.args)[output_args[i]._name].set_data_handle(cache);
+    }
+  }
+
+  auto e = ::dnnl::sycl_interop::execute(
+      *(static_cast<primitive_type *>(primitive.second.primitive)), *_s,
+      *primitive.second.args);
+  _primitive_cache.put(
+      primitive.first, primitive.second.primitive, primitive.second.args,
+      [](::dnnl::primitive *p) { delete static_cast<primitive_type *>(p); }, e,
+      _q);
+  int cache_index = 0;
+  for (int i = 0; i < output_arg_num; i++) {
+    if (output_args[i]._beta != 0.f) {
+      e = async_sum(output_args[i]._alpha, output_args[i]._desc,
+                    caches[cache_index++], output_args[i]._beta,
+                    output_args[i]._desc, output_args[i]._data);
+    } else {
+      if (output_args[i]._alpha != 1.f) {
+        e = async_scale(output_args[i]._alpha, output_args[i]._desc,
+                        output_args[i]._data);
+      }
+    }
+  }
+  return e;
+}
+
+inline
+::dnnl::memory::desc engine_ext::bn_reorder_memory_to_channel_major_format(
+    bool is_input, ::dnnl::memory::desc &desc, void *src, void **cache) {
+  ::dnnl::memory::desc result;
+  result = transfer_memory_desc_to_channel_major_format(desc);
+  if ((result != desc) || !src) {
+    *cache = allocate(desc);
+    if (is_input && src) {
+      async_reorder(1.f, desc, src, 0.f, result, *cache);
+    }
+  }
+  return result;
+}
+
+inline
+sycl::event engine_ext::batch_normalization_backward_internal(
+    batch_normalization_mode mode, float epsilon, float alpha_data,
+    const memory_desc_ext &src_desc, void *src,
+    const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data,
+    const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param,
+    const memory_desc_ext &diff_scale_bias_desc, void *scale, void *bias,
+    float beta_param, void *diff_scale, void *diff_bias,
+    const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var) {
+  if (scale_parameter_preprocess(
+          {{alpha_data, beta_data, diff_src_desc, diff_src},
+           {alpha_param, beta_param, diff_scale_bias_desc, diff_scale},
+           {alpha_param, beta_param, diff_scale_bias_desc, diff_bias}})) {
+    return sycl::event();
+  }
+
+  void *reordered_src = nullptr, *reordered_diff_dst = nullptr,
+       *reordered_diff_src = nullptr, *reordered_scale = nullptr,
+       *reordered_bias = nullptr, *reordered_diff_scale = nullptr,
+       *reordered_diff_bias = nullptr, *reordered_saved_mean = nullptr,
+       *reordered_saved_var = nullptr;
+
+  ::dnnl::memory::desc help_src_desc = src_desc.get_desc();
+  ::dnnl::memory::desc help_diff_dst_desc = diff_dst_desc.get_desc();
+  ::dnnl::memory::desc help_diff_src_desc = diff_src_desc.get_desc();
+  ::dnnl::memory::desc help_diff_scale_bias_desc =
+      diff_scale_bias_desc.get_desc();
+  ::dnnl::memory::desc help_mean_var_desc = mean_var_desc.get_desc();
+  ::dnnl::memory::desc actual_diff_src_desc = help_diff_src_desc;
+  ::dnnl::memory::desc actual_diff_scale_bias_desc = help_diff_scale_bias_desc;
+  enter_primitive(
+      help_diff_scale_bias_desc.get_size() * 14 + help_src_desc.get_size() * 2 +
+      help_diff_dst_desc.get_size() * 7 + help_diff_src_desc.get_size() * 5 +
+      help_mean_var_desc.get_size() * 13);
+  if (mode == batch_normalization_mode::per_activation) {
+    help_src_desc = bn_reorder_memory_to_channel_major_format(true, help_src_desc, src,
+                                                       &reordered_src);
+    help_diff_dst_desc = bn_reorder_memory_to_channel_major_format(
+        true, help_diff_dst_desc, diff_dst, &reordered_diff_dst);
+    help_diff_src_desc = bn_reorder_memory_to_channel_major_format(
+        false, help_diff_src_desc, diff_src, &reordered_diff_src);
+    actual_diff_src_desc = help_diff_src_desc;
+    help_diff_scale_bias_desc = bn_reorder_memory_to_channel_major_format(
+        true, help_diff_scale_bias_desc, scale, &reordered_scale);
+    actual_diff_scale_bias_desc = help_diff_scale_bias_desc;
+    if (bias) {
+      bn_reorder_memory_to_channel_major_format(true, help_diff_scale_bias_desc, bias,
+                                         &reordered_bias);
+    }
+    bn_reorder_memory_to_channel_major_format(false, help_diff_scale_bias_desc,
+                                       diff_scale, &reordered_diff_scale);
+    bn_reorder_memory_to_channel_major_format(false, help_diff_scale_bias_desc,
+                                       diff_bias, &reordered_diff_bias);
+
+    help_mean_var_desc = bn_reorder_memory_to_channel_major_format(
+        true, help_mean_var_desc, saved_mean, &reordered_saved_mean);
+    bn_reorder_memory_to_channel_major_format(true, help_mean_var_desc, saved_var,
+                                       &reordered_saved_var);
+    help_src_desc = compress_spatial_dimensions_to_channel(help_src_desc);
+    help_diff_src_desc =
+        compress_spatial_dimensions_to_channel(help_diff_src_desc);
+    help_diff_dst_desc =
+        compress_spatial_dimensions_to_channel(help_diff_dst_desc);
+  } else {
+    if ((help_src_desc != help_diff_dst_desc) ||
+        (help_src_desc != help_diff_src_desc) ||
+        (help_diff_dst_desc != help_diff_src_desc)) {
+      help_src_desc = bn_reorder_memory_to_channel_major_format(
+          true, help_src_desc, src, &reordered_src);
+      help_diff_dst_desc = bn_reorder_memory_to_channel_major_format(
+          true, help_diff_dst_desc, diff_dst, &reordered_diff_dst);
+      help_diff_src_desc = bn_reorder_memory_to_channel_major_format(
+          false, help_diff_src_desc, diff_src, &reordered_diff_src);
+      actual_diff_src_desc = help_diff_src_desc;
+    }
+  }
+
+  help_diff_scale_bias_desc =
+      get_bn_scale_bias_mean_var_desc(help_diff_scale_bias_desc, mode);
+  help_mean_var_desc =
+      get_bn_scale_bias_mean_var_desc(help_mean_var_desc, mode);
+
+  auto forward_primitive =
+      create_primitive_desc<::dnnl::batch_normalization_forward>(
+          ::dnnl::prop_kind::forward_training, help_src_desc,
+          help_diff_dst_desc, epsilon,
+          ::dnnl::normalization_flags::use_scale |
+              ::dnnl::normalization_flags::use_shift);
+  auto primitive_args =
+      create_primitive_args_or_get<::dnnl::batch_normalization_backward>(
+          ::dnnl::prop_kind::backward, help_diff_src_desc, help_diff_dst_desc,
+          help_src_desc, epsilon,
+          ::dnnl::normalization_flags::use_scale |
+              ::dnnl::normalization_flags::use_shift, forward_primitive);
+
+  void *dst_cache = nullptr;
+  if (!saved_mean && !saved_var) {
+    dst_cache = allocate(diff_dst_desc);
+    if (!reordered_saved_mean) {
+      reordered_saved_mean = allocate(mean_var_desc);
+    }
+    if (!reordered_saved_var) {
+      reordered_saved_var = allocate(mean_var_desc);
+    }
+    if (!bias) {
+      _q->fill(reordered_bias, 0, diff_scale_bias_desc.get_size());
+    }
+
+    batch_normalization_forward_internal(
+        true, mode, epsilon, 0.f, 1.f, src_desc, src, 0.f, diff_dst_desc,
+        dst_cache, diff_scale_bias_desc, scale, bias ? bias : reordered_bias,
+        mean_var_desc, reordered_saved_mean, reordered_saved_var, nullptr,
+        nullptr);
+  }
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, help_src_desc,
+             reordered_src ? reordered_src : src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_SCALE,
+             help_diff_scale_bias_desc,
+             reordered_scale ? reordered_scale : scale);
+  insert_arg(primitive_args.second.args, DNNL_ARG_MEAN, help_mean_var_desc,
+             reordered_saved_mean ? reordered_saved_mean : saved_mean);
+  insert_arg(primitive_args.second.args, DNNL_ARG_VARIANCE, help_mean_var_desc,
+             reordered_saved_var ? reordered_saved_var : saved_var);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, help_diff_src_desc,
+             reordered_diff_dst ? reordered_diff_dst : diff_dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, help_diff_src_desc,
+             reordered_diff_src ? reordered_diff_src : diff_src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SCALE,
+             help_diff_scale_bias_desc,
+             reordered_diff_scale ? reordered_diff_scale : diff_scale);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SHIFT,
+             help_diff_scale_bias_desc,
+             reordered_diff_bias ? reordered_diff_bias : diff_bias);
+
+  sycl::event e = execute_primitive<::dnnl::batch_normalization_backward>(
+      primitive_args,
+      {{alpha_data, beta_data, DNNL_ARG_DIFF_SRC, help_diff_src_desc,
+        reordered_diff_src ? reordered_diff_src : diff_src},
+       {alpha_param, beta_param, DNNL_ARG_DIFF_SCALE, help_diff_scale_bias_desc,
+        reordered_diff_scale ? reordered_diff_scale : diff_scale},
+       {alpha_param, beta_param, DNNL_ARG_DIFF_SHIFT, help_diff_scale_bias_desc,
+        reordered_diff_bias ? reordered_diff_bias : diff_bias}});
+  if (actual_diff_src_desc != diff_src_desc.get_desc() && reordered_diff_src) {
+    e = async_reorder(1.f, actual_diff_src_desc, reordered_diff_src, 0.f,
+                diff_src_desc, diff_src);
+  }
+  if (actual_diff_scale_bias_desc != diff_scale_bias_desc.get_desc() &&
+      reordered_diff_scale && reordered_diff_bias) {
+    async_reorder(1.f, actual_diff_scale_bias_desc, reordered_diff_scale, 0.f,
+            diff_scale_bias_desc, diff_scale);
+    e = async_reorder(1.f, actual_diff_scale_bias_desc, reordered_diff_bias, 0.f,
+                diff_scale_bias_desc, diff_bias);
+  }
+  return exit_primitive(e);
+}
+
+inline
+sycl::event engine_ext::batch_normalization_forward_internal(
+    bool is_infer, batch_normalization_mode mode, float epsilon, float factor,
+    float alpha, const memory_desc_ext &src_desc, void *src, float beta,
+    const memory_desc_ext &dst_desc, void *dst,
+    const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
+    const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var,
+    void *running_mean, void *running_var) {
+  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
+    return sycl::event();
+  }
+  enter_primitive(src_desc.get_size() + 5 * dst_desc.get_size() +
+                  scale_bias_desc.get_size() * 2 +
+                  mean_var_desc.get_size() * 9);
+  void *reordered_src = nullptr, *reordered_dst = nullptr,
+       *reordered_scale = nullptr, *reordered_bias = nullptr,
+       *reordered_saved_mean = nullptr, *reordered_saved_var = nullptr;
+  ::dnnl::memory::desc help_src_desc = src_desc.get_desc();
+  ::dnnl::memory::desc help_dst_desc = dst_desc.get_desc();
+  ::dnnl::memory::desc help_scale_bias_desc = scale_bias_desc.get_desc();
+  ::dnnl::memory::desc help_mean_var_desc = mean_var_desc.get_desc();
+  ::dnnl::memory::desc actual_dst_desc = help_dst_desc;
+  ::dnnl::memory::desc actual_mean_var_desc = help_mean_var_desc;
+
+  if (mode == batch_normalization_mode::per_activation) {
+    help_src_desc = bn_reorder_memory_to_channel_major_format(true, help_src_desc, src,
+                                                       &reordered_src);
+    help_dst_desc = bn_reorder_memory_to_channel_major_format(
+        false, help_dst_desc, dst, &reordered_dst);
+    actual_dst_desc = help_dst_desc;
+    help_scale_bias_desc = bn_reorder_memory_to_channel_major_format(
+        true, help_scale_bias_desc, scale, &reordered_scale);
+    bn_reorder_memory_to_channel_major_format(true, help_scale_bias_desc, bias,
+                                       &reordered_bias);
+    help_mean_var_desc = bn_reorder_memory_to_channel_major_format(
+        is_infer, help_mean_var_desc, saved_mean,
+        &reordered_saved_mean);
+    actual_mean_var_desc = help_mean_var_desc;
+    bn_reorder_memory_to_channel_major_format(is_infer,
+                                       help_mean_var_desc, saved_var,
+                                       &reordered_saved_var);
+    help_src_desc = compress_spatial_dimensions_to_channel(help_src_desc);
+    help_dst_desc = compress_spatial_dimensions_to_channel(help_dst_desc);
+  } else {
+    if (help_src_desc != help_dst_desc) {
+      help_src_desc = bn_reorder_memory_to_channel_major_format(
+          true, help_src_desc, src, &reordered_src);
+      help_dst_desc = bn_reorder_memory_to_channel_major_format(
+          false, help_dst_desc, dst, &reordered_dst);
+      actual_dst_desc = help_dst_desc;
+    }
+  }
+  help_scale_bias_desc =
+      get_bn_scale_bias_mean_var_desc(help_scale_bias_desc, mode);
+  help_mean_var_desc =
+      get_bn_scale_bias_mean_var_desc(help_mean_var_desc, mode);
+
+  ::dnnl::prop_kind kind;
+  ::dnnl::normalization_flags flag = ::dnnl::normalization_flags::use_scale |
+                                     ::dnnl::normalization_flags::use_shift;
+  if (is_infer) {
+    kind = ::dnnl::prop_kind::forward_inference;
+    flag = ::dnnl::normalization_flags::use_global_stats | flag;
+  } else {
+    kind = ::dnnl::prop_kind::forward_training;
+  }
+  auto primitive_args =
+      create_primitive_args_or_get<::dnnl::batch_normalization_forward>(
+          kind, help_src_desc, help_dst_desc, epsilon, flag);
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, help_src_desc,
+             reordered_src ? reordered_src : src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_SCALE, help_scale_bias_desc,
+             reordered_scale ? reordered_scale : scale);
+  insert_arg(primitive_args.second.args, DNNL_ARG_SHIFT, help_scale_bias_desc,
+             reordered_bias ? reordered_bias : bias);
+  insert_arg(primitive_args.second.args, DNNL_ARG_MEAN, help_mean_var_desc,
+             reordered_saved_mean ? reordered_saved_mean
+                                            : saved_mean);
+  insert_arg(primitive_args.second.args, DNNL_ARG_VARIANCE, help_mean_var_desc,
+             reordered_saved_var ? reordered_saved_var
+                                           : saved_var);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, help_dst_desc,
+             reordered_dst ? reordered_dst : dst);
+  sycl::event e = execute_primitive<::dnnl::batch_normalization_forward>(primitive_args,
+                                    {{alpha, beta, DNNL_ARG_DST, help_dst_desc,
+                                      reordered_dst ? reordered_dst : dst}});
+
+  if (!is_infer && running_var) {
+    auto src_ndim = src_desc.get_desc().get_ndims();
+    auto src_dims = src_desc.get_dims();
+    int element_num = src_dims[0];
+    if (mode == batch_normalization_mode::spatial) {
+      for (int index = 2; index < src_ndim; index++) {
+        element_num *= src_dims[index];
+      }
+    }
+    float unbias_factor = element_num / (element_num - 1.f);
+    async_scale(1.f - factor, mean_var_desc, running_var);
+    e = async_sum(factor * unbias_factor, mean_var_desc,
+            reordered_saved_var ? reordered_saved_var : saved_var,
+            1.f, mean_var_desc, running_var);
+  }
+  if (!is_infer && running_mean) {
+    e = async_sum(factor, mean_var_desc,
+            reordered_saved_mean ? reordered_saved_mean : saved_mean,
+            (1.f - factor), mean_var_desc, running_mean);
+  }
+  if (reordered_dst && (actual_dst_desc != dst_desc.get_desc())) {
+    e = async_reorder(1.f, actual_dst_desc, reordered_dst, 0.f, dst_desc, dst);
+  }
+  if (!is_infer && reordered_saved_mean && reordered_saved_var && saved_mean &&
+      saved_var && (actual_mean_var_desc != mean_var_desc.get_desc())) {
+    e = async_reorder(1.f, actual_mean_var_desc, reordered_saved_mean, 0.f,
+                mean_var_desc, saved_mean);
+    e = async_reorder(1.f, actual_mean_var_desc, reordered_saved_var, 0.f,
+                mean_var_desc, saved_var);
+  }
+  return exit_primitive(e);
+}
+
+inline
+sycl::event engine_ext::rnn_forward_internal(
+    const rnn_desc &desc, ::dnnl::prop_kind kind,
+    const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc,
+    void *dst, const memory_desc_ext &iter_desc, void *src_iter, void *dst_iter,
+    const memory_desc_ext &iter_c_desc, void *src_iter_c, void *dst_iter_c,
+    size_t weight_size, void *weight, size_t workspace_size, void *workspace,
+    size_t scratchpad_size, void *scratchpad, bool is_get_execution_args,
+    size_t *weight_size_query, size_t *workspace_size_query,
+    size_t *scratchpad_size_query) {
+  ::dnnl::memory::data_type src_dt;
+  ::dnnl::memory::format_tag src_format_tag;
+  rnn_mode mode;
+  rnn_bias_mode bias_mode;
+  rnn_direction direction;
+  dpct::library_data_t dt;
+  int direction_num = 1, input_size = 0, hidden_size = 0, projection_size = 0,
+      layer_size = 0, gate_num = 1, output_size = 0, data_type_size = 0,
+      seq_length = 1, batch_size = 1;
+  std::vector<void *> data = {src,        dst,        src_iter, dst_iter,
+                              src_iter_c, dst_iter_c, weight,   workspace,
+                              scratchpad};
+  std::vector<int> offset(6, 0);
+  void *input_layer_cache = nullptr, *hidden_layer_cache = nullptr;
+  sycl::event e;
+  enter_primitive(src_desc.get_size() * 2);
+  desc.get(&mode, &bias_mode, &direction, &dt, &input_size, &hidden_size,
+           &projection_size, &layer_size);
+
+  get_rnn_configuration(src_desc.get_desc(), direction, mode, dt, hidden_size,
+                        &src_dt, &src_format_tag, &projection_size,
+                        &output_size, &seq_length, &batch_size, &direction_num,
+                        &gate_num);
+
+  if (direction == rnn_direction::bidirectional) {
+    // Here to combine the oneDNN bidirectional_sum and 
+    // bidirectional_concat config, so call execute_rnn_forward_primitive
+    // twice.
+    if (layer_size > 1) {
+      if (!is_get_execution_args) {
+        input_layer_cache = allocate(src_desc);
+        hidden_layer_cache = allocate(src_desc);
+        _q->memcpy(input_layer_cache, src, src_desc.get_size());
+      }
+      data[0] = input_layer_cache;
+      data[1] = hidden_layer_cache;
+      e = execute_rnn_forward_primitive(
+          mode, kind, ::dnnl::rnn_direction::bidirectional_sum, bias_mode,
+          src_dt, src_format_tag, seq_length, batch_size, output_size,
+          output_size, 1, direction_num, hidden_size, gate_num, projection_size,
+          data, offset, layer_size - 1, weight_size_query, workspace_size_query,
+          scratchpad_size_query);
+      data[0] =
+          ((layer_size - 1) % 2 == 0) ? input_layer_cache : hidden_layer_cache;
+      data[1] = dst;
+    }
+    e = execute_rnn_forward_primitive(
+        mode, kind, ::dnnl::rnn_direction::bidirectional_concat, bias_mode,
+        src_dt, src_format_tag, seq_length, batch_size, output_size,
+        2 * output_size, 1, direction_num, hidden_size, gate_num,
+        projection_size, data, offset, 1, weight_size_query,
+        workspace_size_query, scratchpad_size_query);
+  } else {
+    e = execute_rnn_forward_primitive(
+        mode, kind, ::dnnl::rnn_direction::unidirectional_left2right, bias_mode,
+        src_dt, src_format_tag, seq_length, batch_size, output_size,
+        output_size, layer_size, direction_num, hidden_size, gate_num,
+        projection_size, data, offset, 1, weight_size_query,
+        workspace_size_query, scratchpad_size_query);
+  }
+
+  return exit_primitive(e);
+}
+
+inline
+sycl::event engine_ext::execute_rnn_forward_primitive(
+    rnn_mode mode, ::dnnl::prop_kind kind, ::dnnl::rnn_direction direction,
+    rnn_bias_mode bias_mode, ::dnnl::memory::data_type dt,
+    ::dnnl::memory::format_tag tag, int seq_length, int batch_size, int src_c,
+    int dst_c, int layer_size, int direction_num, int hidden_size, int gate_num,
+    int projection_size, std::vector<void *> &data, std::vector<int> &offset,
+    int iter_num, size_t *weight_size, size_t *workspace_size,
+    size_t *scratchpad_size) {
+
+  sycl::event e;
+  ::dnnl::primitive *p = nullptr;
+  std::unordered_map<int, ::dnnl::memory> *args = nullptr;
+  detail::primitive_cache_key_type key;
+  std::unordered_map<int, ::dnnl::memory> *execution_args;
+  ::dnnl::memory::desc bias_desc(
+      {layer_size, direction_num, gate_num, hidden_size}, dt,
+      ::dnnl::memory::format_tag::ldgo);
+  ::dnnl::memory::desc weight_layer_desc(
+      {layer_size, direction_num,
+       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
+      dt, ::dnnl::memory::format_tag::ldigo);
+  ::dnnl::memory::desc weight_iter_desc(
+      {layer_size, direction_num,
+       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
+      dt, ::dnnl::memory::format_tag::ldigo);
+  ::dnnl::memory::desc projection_desc;
+  if (projection_size) {
+    projection_desc = ::dnnl::memory::desc(
+        {layer_size, direction_num, hidden_size, projection_size}, dt,
+        ::dnnl::memory::format_tag::ldio);
+  }
+
+  if (weight_size) {
+    *weight_size +=
+        (weight_layer_desc.get_size() + weight_iter_desc.get_size() +
+         projection_desc.get_size() + bias_desc.get_size()) *
+        iter_num;
+    return e;
+  }
+
+  ::dnnl::memory::desc src_desc({seq_length, batch_size, src_c}, dt, tag);
+  ::dnnl::memory::desc dst_desc({seq_length, batch_size, dst_c}, dt, tag);
+  ::dnnl::memory::desc iter_desc(
+      {layer_size, direction_num, batch_size,
+       projection_size ? projection_size : hidden_size},
+      dt, ::dnnl::memory::format_tag::ldnc);
+  ::dnnl::memory::desc iter_c_desc(
+      {layer_size, direction_num, batch_size, hidden_size}, dt,
+      ::dnnl::memory::format_tag::ldnc);
+
+  ::dnnl::memory::desc workspace_desc;
+  ::dnnl::memory::desc scratchpad_desc;
+  ::dnnl::primitive_attr attr;
+  attr.set_scratchpad_mode(::dnnl::scratchpad_mode::user);
+
+  if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) {
+    auto primitive = create_primitive_args_or_get<::dnnl::vanilla_rnn_forward>(
+        kind,
+        mode == rnn_mode::vanilla_relu ? ::dnnl::algorithm::eltwise_relu
+                                       : ::dnnl::algorithm::eltwise_tanh,
+        direction, src_desc, iter_desc, weight_layer_desc, weight_iter_desc,
+        bias_desc, dst_desc, iter_desc, attr);
+
+    auto pd = get_primitive_desc<::dnnl::vanilla_rnn_forward>(
+        primitive.second.primitive);
+
+    workspace_desc = pd.workspace_desc();
+    scratchpad_desc = pd.scratchpad_desc();
+    if (workspace_size && scratchpad_size) {
+      *workspace_size += workspace_desc.get_size() * iter_num;
+      *scratchpad_size = scratchpad_desc.get_size() > *scratchpad_size
+                             ? scratchpad_desc.get_size()
+                             : *scratchpad_size;
+    } else {
+      key = primitive.first;
+      p = primitive.second.primitive;
+      args = primitive.second.args;
+    }
+  } else if (mode == rnn_mode::gru) {
+    auto primitive = create_primitive_args_or_get<::dnnl::gru_forward>(
+        kind, direction, src_desc, iter_desc, weight_layer_desc,
+        weight_iter_desc, bias_desc, dst_desc, iter_desc, attr);
+
+    auto pd =
+        get_primitive_desc<::dnnl::gru_forward>(primitive.second.primitive);
+
+    workspace_desc = pd.workspace_desc();
+    scratchpad_desc = pd.scratchpad_desc();
+    if (workspace_size && scratchpad_size) {
+      *workspace_size += workspace_desc.get_size() * iter_num;
+      *scratchpad_size = scratchpad_desc.get_size() > *scratchpad_size
+                             ? scratchpad_desc.get_size()
+                             : *scratchpad_size;
+    } else {
+      key = primitive.first;
+      p = primitive.second.primitive;
+      args = primitive.second.args;
+    }
+  } else if (mode == rnn_mode::lstm) {
+    auto primitive = create_primitive_args_or_get<::dnnl::lstm_forward>(
+        kind, direction, src_desc, iter_desc, iter_c_desc, weight_layer_desc,
+        weight_iter_desc, ::dnnl::memory::desc(), projection_desc, bias_desc,
+        dst_desc, iter_desc, iter_c_desc, attr);
+
+    auto pd =
+        get_primitive_desc<::dnnl::lstm_forward>(primitive.second.primitive);
+
+    workspace_desc = pd.workspace_desc();
+    scratchpad_desc = pd.scratchpad_desc();
+    if (workspace_size && scratchpad_size) {
+      *workspace_size += workspace_desc.get_size() * iter_num;
+      *scratchpad_size = scratchpad_desc.get_size() > *scratchpad_size
+                             ? scratchpad_desc.get_size()
+                             : *scratchpad_size;
+    } else {
+      key = primitive.first;
+      p = primitive.second.primitive;
+      args = primitive.second.args;
+    }
+  }
+
+  for (int i = 0; i < iter_num; i++) {
+    void *in_cache = data[0], *out_cache = data[1], *dst_iter_c_cache = nullptr,
+         *dst_iter_cache = ((uint8_t *)(data[3]) + offset[1]);
+    if (mode == rnn_mode::lstm) {
+      dst_iter_c_cache = (uint8_t *)(data[4]) + offset[2];
+    }
+    if (!workspace_size) {
+      insert_arg(args, DNNL_ARG_SRC_LAYER, src_desc, data[0]);
+      insert_arg(args, DNNL_ARG_DST_LAYER, dst_desc, data[1]);
+      insert_arg(args, DNNL_ARG_SCRATCHPAD, scratchpad_desc, data[8]);
+      auto insert_rnn_arg = [&](int arg_name, ::dnnl::memory::desc &d, void *data,
+                             int &offset) {
+        insert_arg(args, arg_name, d, (uint8_t *)data + offset);
+        offset += d.get_size();
+      };
+      insert_rnn_arg(DNNL_ARG_SRC_ITER, iter_desc, data[2], offset[0]);
+      insert_rnn_arg(DNNL_ARG_DST_ITER, iter_desc, data[3], offset[1]);
+
+      if (mode == rnn_mode::lstm) {
+        insert_rnn_arg(DNNL_ARG_SRC_ITER_C, iter_c_desc, data[4], offset[2]);
+        insert_rnn_arg(DNNL_ARG_DST_ITER_C, iter_c_desc, data[5], offset[3]);
+      }
+      insert_rnn_arg(DNNL_ARG_WEIGHTS_LAYER, weight_layer_desc, data[6],
+                  offset[4]);
+      insert_rnn_arg(DNNL_ARG_WEIGHTS_ITER, weight_iter_desc, data[6], offset[4]);
+      if (projection_size) {
+        insert_rnn_arg(DNNL_ARG_WEIGHTS_PROJECTION, projection_desc, data[6],
+                    offset[4]);
+      }
+      if (bias_mode == rnn_bias_mode::none) {
+        _q->memset((uint8_t *)(data[6]) + offset[4], 0, bias_desc.get_size());
+      }
+      insert_rnn_arg(DNNL_ARG_BIAS, bias_desc, data[6], offset[4]);
+      if (kind == ::dnnl::prop_kind::forward_training) {
+        insert_rnn_arg(DNNL_ARG_WORKSPACE, workspace_desc, data[7], offset[5]);
+      }
+      if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) {
+        execute_primitive<::dnnl::vanilla_rnn_forward>(
+            {key, {static_cast<::dnnl::vanilla_rnn_forward *>(p), args}});
+      } else if (mode == rnn_mode::gru) {
+        execute_primitive<::dnnl::gru_forward>(
+            {key, {static_cast<::dnnl::gru_forward *>(p), args}});
+      } else if (mode == rnn_mode::lstm) {
+        execute_primitive<::dnnl::lstm_forward>(
+            {key, {static_cast<::dnnl::lstm_forward *>(p), args}});
+      }
+      if (i != iter_num - 1) {
+        std::swap(data[0], data[1]);
+      }
+    }
+    if (kind == ::dnnl::prop_kind::forward_training) {
+      if (workspace_size) {
+        *workspace_size +=
+            (src_desc.get_size() + dst_desc.get_size() + iter_desc.get_size());
+        if (mode == rnn_mode::lstm) {
+          *workspace_size += iter_c_desc.get_size();
+        }
+      } else {
+        _q->memcpy((uint8_t *)(data[7]) + offset[5], in_cache,
+                   src_desc.get_size());
+        offset[5] += src_desc.get_size();
+        _q->memcpy((uint8_t *)(data[7]) + offset[5], out_cache,
+                   dst_desc.get_size());
+        offset[5] += dst_desc.get_size();
+        _q->memcpy((uint8_t *)(data[7]) + offset[5], dst_iter_cache,
+                   iter_desc.get_size());
+        offset[5] += iter_desc.get_size();
+        if (mode == rnn_mode::lstm) {
+          _q->memcpy((uint8_t *)(data[7]) + offset[5], dst_iter_c_cache,
+                     iter_c_desc.get_size());
+          offset[5] += iter_c_desc.get_size();
+        }
+      }
+    }
+  }
+  return e;
+}
+
+inline
+sycl::event engine_ext::execute_rnn_backward_primitive(
+    rnn_mode mode, ::dnnl::rnn_direction direction, rnn_bias_mode bias_mode,
+    ::dnnl::memory::data_type dt, ::dnnl::memory::format_tag tag,
+    int seq_length, int batch_size, int src_c, int dst_c, int layer_size,
+    int direction_num, int hidden_size, int gate_num, int projection_size,
+    std::vector<void *> &data, std::vector<int> &offset, int iter_num) {
+
+  sycl::event e;
+  ::dnnl::primitive *p = nullptr;
+  std::unordered_map<int, ::dnnl::memory> *args = nullptr;
+  detail::primitive_cache_key_type key;
+  ::dnnl::prop_kind fkind = ::dnnl::prop_kind::forward_training;
+  ::dnnl::prop_kind bkind = ::dnnl::prop_kind::backward;
+  ::dnnl::memory::desc bias_desc(
+      {layer_size, direction_num, gate_num, hidden_size}, dt,
+      ::dnnl::memory::format_tag::ldgo);
+  ::dnnl::memory::desc weight_layer_desc(
+      {layer_size, direction_num,
+       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
+      dt, ::dnnl::memory::format_tag::ldigo);
+  ::dnnl::memory::desc weight_iter_desc(
+      {layer_size, direction_num,
+       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
+      dt, ::dnnl::memory::format_tag::ldigo);
+  ::dnnl::memory::desc diff_weight_layer_desc(
+      {layer_size, direction_num,
+       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
+      dt, ::dnnl::memory::format_tag::ldgoi);
+  ::dnnl::memory::desc diff_weight_iter_desc(
+      {layer_size, direction_num,
+       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
+      dt, ::dnnl::memory::format_tag::ldgoi);
+  ::dnnl::memory::desc projection_desc, diff_projection_desc;
+  if (projection_size) {
+    projection_desc = ::dnnl::memory::desc(
+        {layer_size, direction_num, hidden_size, projection_size}, dt,
+        ::dnnl::memory::format_tag::ldio);
+    diff_projection_desc = ::dnnl::memory::desc(
+        {layer_size, direction_num, hidden_size, projection_size}, dt,
+        ::dnnl::memory::format_tag::ldoi);
+  }
+
+  ::dnnl::memory::desc src_desc({seq_length, batch_size, src_c}, dt, tag);
+  ::dnnl::memory::desc dst_desc({seq_length, batch_size, dst_c}, dt, tag);
+  ::dnnl::memory::desc iter_desc(
+      {layer_size, direction_num, batch_size,
+       projection_size ? projection_size : hidden_size},
+      dt, ::dnnl::memory::format_tag::ldnc);
+  ::dnnl::memory::desc iter_c_desc(
+      {layer_size, direction_num, batch_size, hidden_size}, dt,
+      ::dnnl::memory::format_tag::ldnc);
+
+  ::dnnl::memory::desc workspace_desc;
+  ::dnnl::memory::desc scratchpad_desc;
+  ::dnnl::primitive_attr attr;
+  attr.set_scratchpad_mode(::dnnl::scratchpad_mode::user);
+
+  if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) {
+    auto fpd = create_primitive_desc<::dnnl::vanilla_rnn_forward>(
+        fkind,
+        mode == rnn_mode::vanilla_relu ? ::dnnl::algorithm::eltwise_relu
+                                       : ::dnnl::algorithm::eltwise_tanh,
+        direction, src_desc, iter_desc, weight_layer_desc, weight_iter_desc,
+        bias_desc, dst_desc, iter_desc, attr);
+    auto primitive = create_primitive_args_or_get<::dnnl::vanilla_rnn_backward>(
+        bkind,
+        mode == rnn_mode::vanilla_relu ? ::dnnl::algorithm::eltwise_relu
+                                       : ::dnnl::algorithm::eltwise_tanh,
+        direction, src_desc, iter_desc, diff_weight_layer_desc,
+        diff_weight_iter_desc, bias_desc, dst_desc, iter_desc, src_desc,
+        iter_desc, weight_layer_desc, weight_iter_desc, bias_desc, dst_desc,
+        iter_desc, fpd, attr);
+    auto pd = get_primitive_desc<::dnnl::vanilla_rnn_backward>(
+        primitive.second.primitive);
+    workspace_desc = pd.workspace_desc();
+    scratchpad_desc = pd.scratchpad_desc();
+    key = primitive.first;
+    p = primitive.second.primitive;
+    args = primitive.second.args;
+  } else if (mode == rnn_mode::gru) {
+    auto fpd = create_primitive_desc<::dnnl::gru_forward>(
+        fkind, direction, src_desc, iter_desc, weight_layer_desc,
+        weight_iter_desc, bias_desc, dst_desc, iter_desc, attr);
+    auto primitive = create_primitive_args_or_get<::dnnl::gru_backward>(
+        bkind, direction, src_desc, iter_desc, diff_weight_layer_desc,
+        diff_weight_iter_desc, bias_desc, dst_desc, iter_desc, src_desc,
+        iter_desc, weight_layer_desc, weight_iter_desc, bias_desc, dst_desc,
+        iter_desc, fpd, attr);
+    auto pd =
+        get_primitive_desc<::dnnl::gru_backward>(primitive.second.primitive);
+    workspace_desc = pd.workspace_desc();
+    scratchpad_desc = pd.scratchpad_desc();
+    key = primitive.first;
+    p = primitive.second.primitive;
+    args = primitive.second.args;
+  } else if (mode == rnn_mode::lstm) {
+    auto fpd = create_primitive_desc<::dnnl::lstm_forward>(
+        fkind, direction, src_desc, iter_desc, iter_c_desc, weight_layer_desc,
+        weight_iter_desc, ::dnnl::memory::desc(), projection_desc, bias_desc,
+        dst_desc, iter_desc, iter_c_desc, attr);
+    auto primitive = create_primitive_args_or_get<::dnnl::lstm_backward>(
+        bkind, direction, src_desc, iter_desc, iter_c_desc,
+        diff_weight_layer_desc, diff_weight_iter_desc, ::dnnl::memory::desc(),
+        diff_projection_desc, bias_desc, dst_desc, iter_desc, iter_c_desc,
+        src_desc, iter_desc, iter_c_desc, weight_layer_desc, weight_iter_desc,
+        ::dnnl::memory::desc(), projection_desc, bias_desc, dst_desc, iter_desc,
+        iter_c_desc, fpd, attr);
+    auto pd =
+        get_primitive_desc<::dnnl::lstm_backward>(primitive.second.primitive);
+    workspace_desc = pd.workspace_desc();
+    scratchpad_desc = pd.scratchpad_desc();
+    key = primitive.first;
+    p = primitive.second.primitive;
+    args = primitive.second.args;
+  }
+
+  for (int i = 0; i < iter_num; i++) {
+    insert_arg(args, DNNL_ARG_DIFF_SRC_LAYER, src_desc, data[8]);
+    insert_arg(args, DNNL_ARG_DIFF_DST_LAYER, dst_desc, data[9]);
+    insert_arg(args, DNNL_ARG_SCRATCHPAD, scratchpad_desc, data[15]);
+    auto insert_rnn_arg = [&](int arg_name, ::dnnl::memory::desc &d, void *data,
+                           int &offset) {
+      offset += d.get_size();
+      insert_arg(args, arg_name, d, (uint8_t *)data - offset);
+    };
+    if (mode == rnn_mode::lstm) {
+      insert_rnn_arg(DNNL_ARG_DST_ITER_C, iter_c_desc, data[7], offset[0]);
+      insert_rnn_arg(DNNL_ARG_SRC_ITER_C, iter_c_desc, data[4], offset[2]);
+    }
+    insert_rnn_arg(DNNL_ARG_DST_ITER, iter_desc, data[7], offset[0]);
+    insert_rnn_arg(DNNL_ARG_DST_LAYER, dst_desc, data[7], offset[0]);
+    insert_rnn_arg(DNNL_ARG_SRC_LAYER, src_desc, data[7], offset[0]);
+    insert_rnn_arg(DNNL_ARG_WORKSPACE, workspace_desc, data[7], offset[0]);
+    insert_rnn_arg(DNNL_ARG_SRC_ITER, iter_desc, data[2], offset[1]);
+    insert_rnn_arg(DNNL_ARG_BIAS, bias_desc, data[6], offset[3]);
+    if (projection_size) {
+      insert_rnn_arg(DNNL_ARG_WEIGHTS_PROJECTION, diff_projection_desc, data[6],
+                  offset[3]);
+    }
+    insert_rnn_arg(DNNL_ARG_WEIGHTS_ITER, diff_weight_iter_desc, data[6],
+                offset[3]);
+    insert_rnn_arg(DNNL_ARG_WEIGHTS_LAYER, diff_weight_layer_desc, data[6],
+                offset[3]);
+    insert_rnn_arg(DNNL_ARG_DIFF_SRC_ITER, iter_desc, data[10], offset[4]);
+    insert_rnn_arg(DNNL_ARG_DIFF_DST_ITER, iter_desc, data[11], offset[5]);
+    if (mode == rnn_mode::lstm) {
+      insert_rnn_arg(DNNL_ARG_DIFF_SRC_ITER_C, iter_c_desc, data[12], offset[6]);
+      insert_rnn_arg(DNNL_ARG_DIFF_DST_ITER_C, iter_c_desc, data[13], offset[7]);
+    }
+    insert_rnn_arg(DNNL_ARG_DIFF_BIAS, bias_desc, data[14], offset[8]);
+    if (bias_mode == rnn_bias_mode::none) {
+      _q->memset((uint8_t *)(data[14]) - offset[8], 0, bias_desc.get_size());
+    }
+    if (projection_size) {
+      insert_rnn_arg(DNNL_ARG_DIFF_WEIGHTS_PROJECTION, projection_desc, data[14],
+                  offset[8]);
+    }
+    insert_rnn_arg(DNNL_ARG_DIFF_WEIGHTS_ITER, weight_iter_desc, data[14],
+                offset[8]);
+    insert_rnn_arg(DNNL_ARG_DIFF_WEIGHTS_LAYER, weight_layer_desc, data[14],
+                offset[8]);
+    if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) {
+      e = execute_primitive<::dnnl::vanilla_rnn_backward>(
+          {key, {static_cast<::dnnl::vanilla_rnn_backward *>(p), args}});
+    } else if (mode == rnn_mode::gru) {
+      e = execute_primitive<::dnnl::gru_backward>(
+          {key, {static_cast<::dnnl::gru_backward *>(p), args}});
+    } else if (mode == rnn_mode::lstm) {
+      e = execute_primitive<::dnnl::lstm_backward>(
+          {key, {static_cast<::dnnl::lstm_backward *>(p), args}});
+    }
+    if (i != iter_num - 1) {
+      std::swap(data[8], data[9]);
+    }
+  }
+  return e;
+}
+
+#define EMPTY_CACHE_KEY(type)                                                  \
+  template <>                                                                  \
+  inline void engine_ext::generate_cache_key<type>(std::string & key_buffer,   \
+                                                   const type &arg) {}
+
+EMPTY_CACHE_KEY(::dnnl::engine)
+EMPTY_CACHE_KEY(::dnnl::convolution_forward::primitive_desc)
+EMPTY_CACHE_KEY(::dnnl::eltwise_forward::primitive_desc)
+EMPTY_CACHE_KEY(::dnnl::softmax_forward::primitive_desc)
+EMPTY_CACHE_KEY(::dnnl::pooling_forward::primitive_desc)
+EMPTY_CACHE_KEY(::dnnl::lrn_forward::primitive_desc)
+EMPTY_CACHE_KEY(::dnnl::batch_normalization_forward::primitive_desc)
+EMPTY_CACHE_KEY(::dnnl::vanilla_rnn_forward::primitive_desc)
+EMPTY_CACHE_KEY(::dnnl::lstm_forward::primitive_desc)
+EMPTY_CACHE_KEY(::dnnl::gru_forward::primitive_desc)
+#undef EMPTY_CACHE_KEY
+
+template <>
+inline void engine_ext::generate_cache_key<std::vector<float>>(
+    std::string &key_buffer, const std::vector<float> &vec) {
+  key_buffer.append((char *)vec.data(), vec.size() * sizeof(float));
+}
+
+template <>
+inline void engine_ext::generate_cache_key<::dnnl::primitive_attr>(
+    std::string &key_buffer, const ::dnnl::primitive_attr &attr) {
+  if (!attr) {
+    return;
+  }
+  auto math_mode = (uint8_t)attr.get_fpmath_mode();
+  key_buffer.append((char *)&math_mode, sizeof(uint8_t));
+}
+
+template <>
+inline void engine_ext::generate_cache_key<::dnnl::memory::dims>(
+    std::string &key_buffer, const ::dnnl::memory::dims &dims) {
+  key_buffer.append((char *)dims.data(), dims.size() * sizeof(int64_t));
+}
+
+template <>
+inline void engine_ext::generate_cache_key<::dnnl::memory::desc>(
+    std::string &key_buffer, const ::dnnl::memory::desc &desc) {
+  uint8_t params[3] = {(uint8_t)desc.get_format_kind(),
+                       (uint8_t)desc.get_ndims(),
+                       (uint8_t)desc.get_data_type()};
+  generate_cache_key(key_buffer, desc.get_inner_blks());
+  generate_cache_key(key_buffer, desc.get_dims());
+  generate_cache_key(key_buffer, desc.get_strides());
+}
+
+template <typename T>
+void engine_ext::generate_cache_key(std::string &key_buffer, const T &arg) {
+  key_buffer.append((char *)&arg, sizeof(T));
+}
+
+template <typename T, typename... args_type>
+void engine_ext::generate_cache_key(std::string &key_buffer, const T &first_arg,
+                                    const args_type &...args) {
+  generate_cache_key(key_buffer, first_arg);
+  generate_cache_key(key_buffer, args...);
+}
+
+template <typename primitive_type, typename... args_type>
+std::pair<detail::primitive_cache_key_type, detail::primitive_and_args>
+engine_ext::create_primitive_args_or_get(args_type &&...args) {
+  std::string buffer;
+  buffer.reserve(512);
+  generate_cache_key(buffer, std::forward<args_type>(args)...);
+  buffer.append(std::to_string(_engine_id));
+  auto value = _primitive_cache.get(buffer);
+  primitive_type *p = nullptr;
+  std::unordered_map<int, ::dnnl::memory> *a = nullptr;
+  if (value) {
+    p = (primitive_type *)value->_primitive;
+    a = value->_args;
+  } else {
+    p = new primitive_type(create_primitive_desc<primitive_type>(
+        std::forward<args_type>(args)...));
+    a = new std::unordered_map<int, ::dnnl::memory>();
+  }
+  return {buffer, {p, a}};
+}
+
+template <typename primitive_type>
+typename primitive_type::primitive_desc
+engine_ext::get_primitive_desc(::dnnl::primitive *p) {
+  return typename primitive_type::primitive_desc(
+      const_cast<dnnl_primitive_desc_t>(p->get_primitive_desc()));
+}
+
+template <typename primitive_type, typename... args_type>
+typename primitive_type::primitive_desc
+engine_ext::create_primitive_desc(args_type &&...args) {
+  return typename primitive_type::primitive_desc(
+      *_eng, std::forward<args_type>(args)...);
+}
+
+inline
+void engine_ext::fill(const memory_desc_ext &src_desc, void *src,
+                      const void *valuePtr) {
+  async_fill(src_desc, src, valuePtr).wait();
+}
+
+inline
+void engine_ext::reorder(float alpha, const memory_desc_ext &src_desc,
+                         void *src, float beta, const memory_desc_ext &dst_desc,
+                         void *dst) {
+  async_reorder(alpha, src_desc, src, beta, dst_desc, dst).wait();
+}
+
+inline
+void engine_ext::scale(float alpha, const memory_desc_ext &src_desc,
+                       void *src) {
+  async_scale(alpha, src_desc, src).wait();
+}
+inline
+void engine_ext::sum(float alpha, const memory_desc_ext &src_desc, void *src,
+                     float beta, const memory_desc_ext &dst_desc, void *dst) {
+  async_sum(alpha, src_desc, src, beta, dst_desc, dst).wait();
+}
+inline
+void engine_ext::activation_forward(activation_desc &desc, float alpha,
+                                    const memory_desc_ext &src_desc, void *src,
+                                    float beta, const memory_desc_ext &dst_desc,
+                                    void *dst) {
+  async_activation_forward(desc, alpha, src_desc, src, beta, dst_desc, dst)
+      .wait();
+}
+inline
+void engine_ext::activation_backward(
+    activation_desc &desc, float alpha, const memory_desc_ext &dst_desc,
+    void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst,
+    const memory_desc_ext &src_desc, void *src, float beta,
+    const memory_desc_ext &diff_src_desc, void *diff_src) {
+  async_activation_backward(desc, alpha, dst_desc, dst, diff_dst_desc, diff_dst,
+                            src_desc, src, beta, diff_src_desc, diff_src)
+      .wait();
+}
+inline
+void engine_ext::pooling_forward(pooling_desc &desc, float alpha,
+                                 const memory_desc_ext &src_desc, void *src,
+                                 float beta, const memory_desc_ext &dst_desc,
+                                 void *dst,
+                                 ::dnnl::memory *workspace) {
+  async_pooling_forward(desc, alpha, src_desc, src, beta, dst_desc, dst,
+                        workspace).wait();
+}
+
+inline
+void engine_ext::pooling_backward(
+    pooling_desc &desc, float alpha, const memory_desc_ext &dst_desc, void *dst,
+    const memory_desc_ext &diff_dst_desc, void *diff_dst,
+    const memory_desc_ext &src_desc, void *src, float beta,
+    const memory_desc_ext &diff_src_desc, void *diff_src,
+    ::dnnl::memory *workspace) {
+  async_pooling_backward(desc, alpha, dst_desc, dst, diff_dst_desc, diff_dst,
+                         src_desc, src, beta, diff_src_desc, diff_src,
+                         workspace)
+      .wait();
+}
+
+inline
+void engine_ext::softmax_forward(softmax_algorithm alg, softmax_mode mode,
+                                 float alpha, const memory_desc_ext &src_desc,
+                                 void *src, float beta,
+                                 const memory_desc_ext &dst_desc, void *dst) {
+  async_softmax_forward(alg, mode, alpha, src_desc, src, beta, dst_desc, dst)
+      .wait();
+}
+
+inline
+void engine_ext::softmax_backward(softmax_algorithm alg, softmax_mode mode,
+                                  float alpha, const memory_desc_ext &dst_desc,
+                                  void *dst,
+                                  const memory_desc_ext &diff_dst_desc,
+                                  void *diff_dst, float beta,
+                                  const memory_desc_ext &diff_src_desc,
+                                  void *diff_src) {
+  async_softmax_backward(alg, mode, alpha, dst_desc, dst, diff_dst_desc,
+                         diff_dst, beta, diff_src_desc, diff_src)
+      .wait();
+}
+
+inline
+void engine_ext::lrn_forward(lrn_desc &desc, float alpha,
+                             const memory_desc_ext &src_desc, void *src,
+                             float beta, const memory_desc_ext &dst_desc,
+                             void *dst, ::dnnl::memory *workspace) {
+  async_lrn_forward(desc, alpha, src_desc, src, beta, dst_desc, dst, workspace)
+      .wait();
+}
+
+inline
+void engine_ext::lrn_backward(lrn_desc &desc, float alpha,
+                              const memory_desc_ext &dst_desc, void *dst,
+                              const memory_desc_ext &diff_dst_desc,
+                              void *diff_dst, const memory_desc_ext &src_desc,
+                              void *src, float beta,
+                              const memory_desc_ext &diff_src_desc,
+                              void *diff_src,
+                              ::dnnl::memory *workspace) {
+  async_lrn_backward(desc, alpha, dst_desc, dst, diff_dst_desc, diff_dst,
+                     src_desc, src, beta, diff_src_desc, diff_src, workspace)
+      .wait();
+}
+
+inline
+sycl::event engine_ext::async_fill(const memory_desc_ext &src_desc, void *src,
+                             const void *valuePtr) {
+  ::dnnl::memory::data_type dt = src_desc.get_desc().get_data_type();
+  unsigned mem_size = src_desc.get_size();
+  switch (dt) {
+  case ::dnnl::memory::data_type::f32:
+    return fill_with_type<float>(_q, src, valuePtr, mem_size);
+  case ::dnnl::memory::data_type::f16:
+    return fill_with_type<sycl::half>(_q, src, valuePtr, mem_size);
+  case ::dnnl::memory::data_type::s32:
+    return fill_with_type<int32_t>(_q, src, valuePtr, mem_size);
+  case ::dnnl::memory::data_type::s8:
+    return fill_with_type<int8_t>(_q, src, valuePtr, mem_size);
+  case ::dnnl::memory::data_type::u8:
+    return fill_with_type<uint8_t>(_q, src, valuePtr, mem_size);
+  default:
+    throw std::runtime_error("async_fill: unsupported data type.");
+  }
+}
+
+inline
+sycl::event engine_ext::async_reorder(float alpha, const memory_desc_ext &src_desc,
+                                void *src, float beta,
+                                const memory_desc_ext &dst_desc, void *dst) {
+  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
+    return sycl::event();
+  }
+  enter_primitive(2 * dst_desc.get_size());
+
+  auto primitive_args = create_primitive_args_or_get<::dnnl::reorder>(
+      src_desc.get_desc(), *_eng, dst_desc.get_desc());
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
+             src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+             dst);
+
+  return exit_primitive(execute_primitive<::dnnl::reorder>(
+      primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}}));
+}
+
+inline
+sycl::event engine_ext::async_scale(float alpha, const memory_desc_ext &src_desc,
+                              void *src) {
+  if (alpha == 1.f) {
+    return sycl::event();
+  }
+  size_t cache_size = src_desc.get_size();
+  enter_primitive(cache_size);
+  void *src_cache = allocate(cache_size);
+  _q->memcpy(src_cache, src, cache_size);
+  auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_forward>(
+      ::dnnl::prop_kind::forward_inference, ::dnnl::algorithm::eltwise_linear,
+      src_desc.get_desc(), src_desc.get_desc(), alpha, 0.f);
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
+             src_cache);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, src_desc.get_desc(),
+             src);
+
+  return exit_primitive(
+      execute_primitive<::dnnl::eltwise_forward>(primitive_args));
+}
+
+inline sycl::event
+engine_ext::async_sum(float alpha, const memory_desc_ext &src_desc, void *src,
+                      float beta, const memory_desc_ext &dst_desc, void *dst) {
+  if (alpha == 0.f && beta == 1.f) {
+    return sycl::event();
+  }
+  size_t cache_size = dst_desc.get_size();
+  enter_primitive(cache_size);
+  void *dst_cache = allocate(dst_desc);
+  _q->memcpy(dst_cache, dst, cache_size);
+
+  auto primitive_args = create_primitive_args_or_get<::dnnl::sum>(
+      std::vector<float>{alpha, beta},
+      std::vector<::dnnl::memory::desc>{src_desc.get_desc(),
+                                        dst_desc.get_desc()});
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+             dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_MULTIPLE_SRC,
+             src_desc.get_desc(), src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_MULTIPLE_SRC + 1,
+             dst_desc.get_desc(), dst_cache);
+
+  return exit_primitive(execute_primitive<::dnnl::sum>(primitive_args));
+}
+
+inline
+sycl::event engine_ext::async_binary(binary_op op, float alpha_0,
+                               const memory_desc_ext &src_desc_0, void *src_0,
+                               float alpha_1, const memory_desc_ext &src_desc_1,
+                               void *src_1, float beta,
+                               const memory_desc_ext &dst_desc, void *dst) {
+  ::dnnl::algorithm onednn_algorithm;
+  switch (op) {
+  case binary_op::max:
+    onednn_algorithm = ::dnnl::algorithm::binary_max;
+    break;
+  case binary_op::min:
+    onednn_algorithm = ::dnnl::algorithm::binary_min;
+    break;
+  case binary_op::add:
+    onednn_algorithm = ::dnnl::algorithm::binary_add;
+    break;
+  case binary_op::sub:
+    onednn_algorithm = ::dnnl::algorithm::binary_sub;
+    break;
+  case binary_op::mul:
+    onednn_algorithm = ::dnnl::algorithm::binary_mul;
+    break;
+  case binary_op::div:
+    onednn_algorithm = ::dnnl::algorithm::binary_div;
+    break;
+  case binary_op::sqrt:
+    onednn_algorithm = ::dnnl::algorithm::eltwise_sqrt;
+    break;
+  case binary_op::neg:
+    onednn_algorithm = ::dnnl::algorithm::eltwise_linear;
+    break;
+  }
+  size_t src0_cache_size = src_desc_0.get_size();
+  size_t src1_cache_size = src_desc_1.get_size();
+  size_t dst_cache_size = dst_desc.get_size();
+  enter_primitive(2 * src0_cache_size + 2 * src1_cache_size +
+                  5 * dst_cache_size);
+  if (onednn_algorithm == ::dnnl::algorithm::eltwise_sqrt ||
+      onednn_algorithm == ::dnnl::algorithm::eltwise_linear) {
+    void *src_cache = nullptr, *dst_cache = nullptr;
+    src_cache = allocate(src0_cache_size);
+    dst_cache = allocate(dst_cache_size);
+    _q->memcpy(src_cache, src_0, src0_cache_size);
+    _q->memcpy(dst_cache, dst, dst_cache_size);
+    async_scale(alpha_0, src_desc_0, src_cache);
+    async_scale(beta, dst_desc, dst_cache);
+
+    // Let the output = 1 - input to simulate the behavior of neg.
+    auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_forward>(
+        ::dnnl::prop_kind::forward_inference, onednn_algorithm,
+        src_desc_0.get_desc(), dst_desc.get_desc(), -1.f, 1.f);
+
+    insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc_0.get_desc(),
+               src_cache);
+    insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+               dst);
+
+    execute_primitive<::dnnl::eltwise_forward>(
+        primitive_args, {{1.f, 0.f, DNNL_ARG_DST, dst_desc, dst}});
+    return exit_primitive(
+        async_sum(1.f, dst_desc, dst_cache, 1.f, dst_desc, dst));
+  }
+
+  void *src_0_cache = nullptr, *src_1_cache = nullptr, *dst_cache = nullptr;
+
+  src_0_cache = allocate(src0_cache_size);
+  src_1_cache = allocate(src1_cache_size);
+  dst_cache = allocate(dst_cache_size);
+
+  _q->memcpy(src_0_cache, src_0, src0_cache_size);
+  _q->memcpy(src_1_cache, src_1, src1_cache_size);
+  _q->memcpy(dst_cache, dst, dst_cache_size);
+
+  async_scale(alpha_0, src_desc_0, src_0_cache);
+  async_scale(alpha_1, src_desc_1, src_1_cache);
+  async_scale(beta, dst_desc, dst_cache);
+
+  auto primitive_args = create_primitive_args_or_get<::dnnl::binary>(
+      onednn_algorithm, src_desc_0.get_desc(), src_desc_1.get_desc(),
+      dst_desc.get_desc());
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_0, src_desc_0.get_desc(),
+             src_0_cache);
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_1, src_desc_1.get_desc(),
+             src_1_cache);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+             dst);
+
+  execute_primitive<::dnnl::binary>(primitive_args,
+                                    {{1.f, 0.f, DNNL_ARG_DST, dst_desc, dst}});
+  return exit_primitive(
+      async_sum(1.f, dst_desc, dst_cache, 1.f, dst_desc, dst));
+}
+
+inline
+sycl::event engine_ext::async_reduction(reduction_op op, float alpha,
+                                  const memory_desc_ext &src_desc, void *src,
+                                  float beta, const memory_desc_ext &dst_desc,
+                                  void *dst) {
+  if (alpha == 0.f && beta == 1.f) {
+    return sycl::event();
+  }
+  size_t src_cache_size = src_desc.get_size();
+  size_t dst_cache_size = dst_desc.get_size();
+  enter_primitive(3 * src_cache_size + 2 * dst_cache_size);
+  float p = 2.f;
+  ::dnnl::algorithm onednn_algorithm;
+  void *cache = nullptr;
+  switch (op) {
+  case reduction_op::amax:
+    cache = allocate(src_cache_size);
+    activation_desc adesc;
+    adesc.set_algorithm(::dnnl::algorithm::eltwise_abs);
+    async_activation_forward(adesc, 1.f, src_desc, src, 0.f, src_desc, cache);
+    onednn_algorithm = ::dnnl::algorithm::reduction_max;
+    src = cache;
+    break;
+  case reduction_op::max:
+    onednn_algorithm = ::dnnl::algorithm::reduction_max;
+    break;
+  case reduction_op::min:
+    onednn_algorithm = ::dnnl::algorithm::reduction_min;
+    break;
+  case reduction_op::sum:
+    onednn_algorithm = ::dnnl::algorithm::reduction_sum;
+    break;
+  case reduction_op::mean:
+    onednn_algorithm = ::dnnl::algorithm::reduction_mean;
+    break;
+  case reduction_op::mul:
+    onednn_algorithm = ::dnnl::algorithm::reduction_mul;
+    break;
+  case reduction_op::mul_no_zeros:
+    cache = allocate(src_cache_size);
+    transform_no_zero(src_desc, src, cache);
+    onednn_algorithm = ::dnnl::algorithm::reduction_mul;
+    src = cache;
+    break;
+  case reduction_op::norm1:
+    p = 1.f;
+    onednn_algorithm = ::dnnl::algorithm::reduction_norm_lp_power_p_sum;
+    break;
+  case reduction_op::norm2:
+    onednn_algorithm = ::dnnl::algorithm::reduction_norm_lp_sum;
+    break;
+  }
+  auto primitive_args = create_primitive_args_or_get<::dnnl::reduction>(
+      onednn_algorithm, src_desc.get_desc(), dst_desc.get_desc(), p, 0.f);
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
+             src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+             dst);
+
+  return exit_primitive(execute_primitive<::dnnl::reduction>(
+      primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}}));
+}
+
+inline
+sycl::event engine_ext::async_activation_forward(activation_desc &desc, float alpha,
+                                           const memory_desc_ext &src_desc,
+                                           void *src, float beta,
+                                           const memory_desc_ext &dst_desc,
+                                           void *dst) {
+  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
+    return sycl::event();
+  }
+  enter_primitive(2 * dst_desc.get_size());
+  auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_forward>(
+      ::dnnl::prop_kind::forward, desc.get_algorithm(), src_desc.get_desc(),
+      dst_desc.get_desc(), desc.get_alpha(), desc.get_beta());
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
+             src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+             dst);
+
+  return exit_primitive(execute_primitive<::dnnl::eltwise_forward>(
+      primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}}));
+}
+
+inline
+sycl::event engine_ext::async_activation_backward(
+    activation_desc &desc, float alpha, const memory_desc_ext &dst_desc,
+    void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst,
+    const memory_desc_ext &src_desc, void *src, float beta,
+    const memory_desc_ext &diff_src_desc, void *diff_src) {
+
+  if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) {
+    return sycl::event();
+  }
+  enter_primitive(2 * diff_src_desc.get_size());
+  ::dnnl::memory::desc data_desc = dst_desc.get_desc();
+  auto alg = desc.get_algorithm();
+  if ((alg == ::dnnl::algorithm::eltwise_clip) ||
+      (alg == ::dnnl::algorithm::eltwise_linear) ||
+      (alg == ::dnnl::algorithm::eltwise_swish)) {
+    data_desc = src_desc.get_desc();
+  }
+  auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_backward>(
+      alg, diff_src_desc.get_desc(), diff_dst_desc.get_desc(), data_desc,
+      desc.get_alpha(), desc.get_beta(),
+      create_primitive_desc<::dnnl::eltwise_forward>(
+          ::dnnl::prop_kind::forward, alg, src_desc.get_desc(),
+          dst_desc.get_desc(), desc.get_alpha(), desc.get_beta()));
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+             dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
+             src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST,
+             diff_dst_desc.get_desc(), diff_dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC,
+             diff_src_desc.get_desc(), diff_src);
+
+  return exit_primitive(execute_primitive<::dnnl::eltwise_backward>(
+      primitive_args,
+      {{alpha, beta, DNNL_ARG_DIFF_SRC, diff_src_desc, diff_src}}));
+}
+
+inline
+sycl::event engine_ext::async_pooling_forward(pooling_desc &desc, float alpha,
+                                        const memory_desc_ext &src_desc,
+                                        void *src, float beta,
+                                        const memory_desc_ext &dst_desc,
+                                        void *dst, ::dnnl::memory *workspace) {
+  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
+    return sycl::event();
+  }
+  enter_primitive(2 * dst_desc.get_size());
+  int pooling_dim = desc.get_stride().size();
+  std::vector<int64_t> dilation(pooling_dim, 0);
+  auto primitive_args =
+      create_primitive_args_or_get<::dnnl::pooling_forward>(
+          ::dnnl::prop_kind::forward_training, desc.get_algorithm(),
+          src_desc.get_desc(), dst_desc.get_desc(), desc.get_stride(),
+          desc.get_kernel(), dilation, desc.get_padding(), desc.get_padding());
+  auto pd = get_primitive_desc<::dnnl::pooling_forward>(
+      primitive_args.second.primitive);
+  ::dnnl::memory ws_mem(pd.workspace_desc(), *_eng);
+  if (workspace) {
+    *workspace = ws_mem;
+  } else {
+    insert_workspace(src, ws_mem);
+  }
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
+             src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, ws_mem);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+             dst);
+
+  return exit_primitive(execute_primitive<::dnnl::pooling_forward>(
+      primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}}));
+}
+
+inline
+sycl::event engine_ext::async_pooling_backward(
+    pooling_desc &desc, float alpha, const memory_desc_ext &dst_desc, void *dst,
+    const memory_desc_ext &diff_dst_desc, void *diff_dst,
+    const memory_desc_ext &src_desc, void *src, float beta,
+    const memory_desc_ext &diff_src_desc, void *diff_src,
+    ::dnnl::memory *workspace) {
+  if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) {
+    return sycl::event();
+  }
+  enter_primitive(2 * diff_src_desc.get_size());
+  int pooling_dim = desc.get_stride().size();
+  std::vector<int64_t> dilation(pooling_dim, 0);
+  auto primitive_args = create_primitive_args_or_get<::dnnl::pooling_backward>(
+      desc.get_algorithm(), diff_src_desc.get_desc(), diff_dst_desc.get_desc(),
+      desc.get_stride(), desc.get_kernel(), dilation, desc.get_padding(),
+      desc.get_padding(),
+      create_primitive_desc<::dnnl::pooling_forward>(
+          ::dnnl::prop_kind::forward_training, desc.get_algorithm(),
+          src_desc.get_desc(), dst_desc.get_desc(), desc.get_stride(),
+          desc.get_kernel(), dilation, desc.get_padding(), desc.get_padding()));
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+             dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
+             src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST,
+             diff_dst_desc.get_desc(), diff_dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC,
+             diff_src_desc.get_desc(), diff_src);
+
+  if (workspace) {
+    insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, *workspace);
+  } else {
+    insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE,
+               get_workspace(src));
+  }
+
+  return exit_primitive(execute_primitive<::dnnl::pooling_backward>(
+      primitive_args,
+      {{alpha, beta, DNNL_ARG_DIFF_SRC, diff_src_desc, diff_src}}));
+}
+
+inline
+sycl::event engine_ext::async_softmax_forward(softmax_algorithm alg,
+                                        softmax_mode mode, float alpha,
+                                        const memory_desc_ext &src_desc,
+                                        void *src, float beta,
+                                        const memory_desc_ext &dst_desc,
+                                        void *dst) {
+  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
+    return sycl::event();
+  }
+
+  ::dnnl::memory::desc help_src_desc = src_desc.get_desc();
+  ::dnnl::memory::desc help_dst_desc = dst_desc.get_desc();
+  if (mode == softmax_mode::instance) {
+    help_src_desc = compress_spatial_dimensions_to_channel(help_src_desc);
+    help_dst_desc = compress_spatial_dimensions_to_channel(help_dst_desc);
+  }
+  enter_primitive(2 * help_dst_desc.get_size());
+
+  ::dnnl::algorithm softmax_alg = ::dnnl::algorithm::softmax_accurate;
+  if (alg == softmax_algorithm::log) {
+    softmax_alg = ::dnnl::algorithm::softmax_log;
+  }
+  auto primitive_args = create_primitive_args_or_get<::dnnl::softmax_forward>(
+      ::dnnl::prop_kind::forward, softmax_alg, help_src_desc, 
+      help_dst_desc, 1);
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, help_dst_desc, dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, help_src_desc, src);
+
+  return exit_primitive(execute_primitive<::dnnl::softmax_forward>(
+      primitive_args,
+      {{alpha, beta, DNNL_ARG_DST, memory_desc_ext(help_dst_desc), dst}}));
+}
+
+inline
+sycl::event engine_ext::async_softmax_backward(
+    softmax_algorithm alg, softmax_mode mode, float alpha,
+    const memory_desc_ext &dst_desc, void *dst,
+    const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta,
+    const memory_desc_ext &diff_src_desc, void *diff_src) {
+  if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) {
+    return sycl::event();
+  }
+  ::dnnl::memory::desc help_diff_src_desc = diff_src_desc.get_desc();
+  ::dnnl::memory::desc help_dst_desc = dst_desc.get_desc();
+  ::dnnl::memory::desc help_diff_dst_desc = diff_dst_desc.get_desc();
+  if (mode == softmax_mode::instance) {
+    help_diff_src_desc =
+        compress_spatial_dimensions_to_channel(help_diff_src_desc);
+    help_dst_desc = compress_spatial_dimensions_to_channel(help_dst_desc);
+    help_diff_dst_desc =
+        compress_spatial_dimensions_to_channel(help_diff_dst_desc);
+  }
+  enter_primitive(2 * help_diff_src_desc.get_size());
+
+  ::dnnl::algorithm softmax_alg = ::dnnl::algorithm::softmax_accurate;
+  if (alg == softmax_algorithm::log) {
+    softmax_alg = ::dnnl::algorithm::softmax_log;
+  }
+
+  auto primitive_args = create_primitive_args_or_get<::dnnl::softmax_backward>(
+      softmax_alg, help_diff_src_desc, help_diff_dst_desc, help_dst_desc, 1,
+      create_primitive_desc<::dnnl::softmax_forward>(
+          ::dnnl::prop_kind::forward, softmax_alg, help_diff_src_desc,
+          help_dst_desc, 1));
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, help_dst_desc, dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, help_diff_dst_desc,
+             diff_dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, help_diff_src_desc,
+             diff_src);
+
+  return exit_primitive(execute_primitive<::dnnl::softmax_backward>(
+      primitive_args, {{alpha, beta, DNNL_ARG_DIFF_SRC,
+                        memory_desc_ext(help_diff_src_desc), diff_src}}));
+}
+
+inline
+sycl::event engine_ext::async_lrn_forward(lrn_desc &desc, float alpha,
+                                    const memory_desc_ext &src_desc, void *src,
+                                    float beta, const memory_desc_ext &dst_desc,
+                                    void *dst, ::dnnl::memory *workspace) {
+
+  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
+    return sycl::event();
+  }
+  enter_primitive(2 * dst_desc.get_size());
+  auto primitive_args = create_primitive_args_or_get<::dnnl::lrn_forward>(
+      ::dnnl::prop_kind::forward_training,
+      ::dnnl::algorithm::lrn_across_channels, src_desc.get_desc(),
+      dst_desc.get_desc(), desc.get_local_size(), desc.get_alpha(),
+      desc.get_beta(), desc.get_k());
+  auto pd =
+      get_primitive_desc<::dnnl::lrn_forward>(primitive_args.second.primitive);
+  ::dnnl::memory ws_mem(pd.workspace_desc(), *_eng);
+  if (workspace) {
+    *workspace = ws_mem;
+  } else {
+    insert_workspace(src, ws_mem);
+  }
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
+             src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+             dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, ws_mem);
+
+  return exit_primitive(execute_primitive<::dnnl::lrn_forward>(
+      primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}}));
+}
+
+inline
+sycl::event
+engine_ext::async_lrn_backward(lrn_desc &desc, float alpha,
+                         const memory_desc_ext &dst_desc, void *dst,
+                         const memory_desc_ext &diff_dst_desc, void *diff_dst,
+                         const memory_desc_ext &src_desc, void *src, float beta,
+                         const memory_desc_ext &diff_src_desc, void *diff_src,
+                         ::dnnl::memory *workspace) {
+
+  if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) {
+    return sycl::event();
+  }
+  enter_primitive(2 * diff_src_desc.get_size());
+  auto primitive_args = create_primitive_args_or_get<::dnnl::lrn_backward>(
+      ::dnnl::algorithm::lrn_across_channels, diff_src_desc.get_desc(),
+      diff_dst_desc.get_desc(), src_desc.get_desc(), desc.get_local_size(),
+      desc.get_alpha(), desc.get_beta(), desc.get_k(),
+      create_primitive_desc<::dnnl::lrn_forward>(
+          ::dnnl::prop_kind::forward_training,
+          ::dnnl::algorithm::lrn_across_channels, src_desc.get_desc(),
+          dst_desc.get_desc(), desc.get_local_size(), desc.get_alpha(),
+          desc.get_beta(), desc.get_k()));
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
+             src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+             dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST,
+             diff_dst_desc.get_desc(), diff_dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC,
+             diff_src_desc.get_desc(), diff_src);
+
+  if (workspace) {
+    insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, *workspace);
+  } else {
+    insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE,
+               get_workspace(src));
+  }
+
+  return exit_primitive(execute_primitive<::dnnl::lrn_backward>(
+      primitive_args,
+      {{alpha, beta, DNNL_ARG_DIFF_SRC, diff_src_desc, diff_src}}));
+}
+
+inline
+size_t engine_ext::get_batch_normalization_workspace_size(
+    batch_normalization_ops ops, const memory_desc_ext &src_desc) {
+  if(ops == batch_normalization_ops::none) {
+    return 0;
+  }
+  return src_desc.get_size();
+}
+
+inline
+sycl::event engine_ext::async_batch_normalization_forward_inference(
+    batch_normalization_mode mode, float epsilon, float alpha,
+    const memory_desc_ext &src_desc, void *src, float beta,
+    const memory_desc_ext &dst_desc, void *dst,
+    const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
+    void *mean, void *var) {
+
+  return batch_normalization_forward_internal(
+      true, mode, epsilon, 0.f, alpha, src_desc, src, beta, dst_desc, dst,
+      scale_bias_mean_var_desc, scale, bias, scale_bias_mean_var_desc, mean,
+      var, nullptr, nullptr);
+}
+
+inline
+sycl::event engine_ext::async_batch_normalization_forward_inference(
+    batch_normalization_mode mode, batch_normalization_ops ops,
+    activation_desc &adesc, float epsilon, float alpha,
+    const memory_desc_ext &src_desc, void *src, float beta,
+    const memory_desc_ext &dst_desc, void *dst,
+    const memory_desc_ext &summand_desc, void *summand,
+    const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
+    const memory_desc_ext &mean_var_desc, void *mean, void *var) {
+
+  bool has_post_op = (ops != batch_normalization_ops::none);
+  sycl::event e;
+  enter_primitive(src_desc.get_size() + dst_desc.get_size() * 4 +
+                  scale_bias_desc.get_size() * 2 +
+                  mean_var_desc.get_size() * 5);
+  if (has_post_op) {
+    void *dst_cache = allocate(dst_desc);
+    batch_normalization_forward_internal(
+        true, mode, epsilon, 0.f, 1.f, src_desc, src, 0.f, dst_desc, dst_cache,
+        scale_bias_desc, scale, bias, mean_var_desc, mean, var, nullptr,
+        nullptr);
+
+    if (ops == batch_normalization_ops::add_activation) {
+      async_sum(1.f, summand_desc, summand, 1.f, dst_desc, dst_cache);
+    }
+    async_activation_forward(adesc, 1.f, dst_desc, dst_cache, 0.f, dst_desc,
+                       dst_cache);
+    return exit_primitive(
+        async_sum(alpha, dst_desc, dst_cache, beta, dst_desc, dst));
+  }
+  return exit_primitive(batch_normalization_forward_internal(
+      true, mode, epsilon, 0.f, alpha, src_desc, src, beta, dst_desc, dst,
+      scale_bias_desc, scale, bias, mean_var_desc, mean, var, nullptr,
+      nullptr));
+}
+
+inline
+sycl::event engine_ext::async_batch_normalization_forward_training(
+    batch_normalization_mode mode, float epsilon, float factor, float alpha,
+    const memory_desc_ext &src_desc, void *src, float beta,
+    const memory_desc_ext &dst_desc, void *dst,
+    const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
+    void *running_mean, void *running_var, void *saved_mean, void *saved_var) {
+  return batch_normalization_forward_internal(
+      false, mode, epsilon, factor, alpha, src_desc, src, beta, dst_desc, dst,
+      scale_bias_mean_var_desc, scale, bias, scale_bias_mean_var_desc,
+      saved_mean, saved_var, running_mean, running_var);
+}
+
+inline
+sycl::event engine_ext::async_batch_normalization_forward_training(
+    batch_normalization_mode mode, batch_normalization_ops ops,
+    activation_desc &adesc, float epsilon, float factor, float alpha,
+    const memory_desc_ext &src_desc, void *src, float beta,
+    const memory_desc_ext &dst_desc, void *dst,
+    const memory_desc_ext &summand_desc, void *summand,
+    const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
+    const memory_desc_ext &mean_var_desc, void *running_mean, void *running_var,
+    void *saved_mean, void *saved_var, size_t workspace_size,
+    void *workspace) {
+  enter_primitive(src_desc.get_size() + dst_desc.get_size() * 3 +
+                  mean_var_desc.get_size() * 5 +
+                  scale_bias_desc.get_size() * 2);
+  bool has_post_op = (ops != batch_normalization_ops::none);
+  sycl::event e;
+  if (has_post_op) {
+    if(workspace_size < dst_desc.get_desc().get_size()) {
+      throw std::runtime_error("async_batch_normalization_forward_training_ex: "
+        "no sufficient workspace.");
+    }
+    batch_normalization_forward_internal(
+        false, mode, epsilon, factor, 1.f, src_desc, src, 0.f, dst_desc,
+        workspace, scale_bias_desc, scale, bias, mean_var_desc,
+        saved_mean, saved_var, running_mean, running_var);
+    if (ops == batch_normalization_ops::add_activation) {
+      async_sum(1.f, summand_desc, summand, 1.f, dst_desc,
+          workspace);
+    }
+    return exit_primitive(async_activation_forward(
+        adesc, alpha, dst_desc, workspace, beta, dst_desc, dst));
+  }
+  return exit_primitive(batch_normalization_forward_internal(
+      false, mode, epsilon, factor, alpha, src_desc, src, beta, dst_desc, dst,
+      scale_bias_desc, scale, bias, mean_var_desc, saved_mean, saved_var,
+      running_mean, running_var));
+}
+
+inline
+sycl::event engine_ext::async_batch_normalization_forward_training(
+    batch_normalization_mode mode, batch_normalization_ops ops,
+    activation_desc &adesc, float epsilon, float factor, float alpha,
+    const memory_desc_ext &src_desc, void *src, float beta,
+    const memory_desc_ext &dst_desc, void *dst,
+    const memory_desc_ext &summand_desc, void *summand,
+    const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
+    void *running_mean, void *running_var, void *saved_mean, void *saved_var,
+    size_t workspace_size, void *workspace) {
+  return async_batch_normalization_forward_training(
+      mode, ops, adesc, epsilon, factor, alpha, src_desc, src, beta, dst_desc,
+      dst, summand_desc, summand, scale_bias_mean_var_desc, scale, bias,
+      scale_bias_mean_var_desc, running_mean, running_var, saved_mean,
+      saved_var, workspace_size, workspace);
+}
+
+inline
+sycl::event engine_ext::async_batch_normalization_backward(
+    batch_normalization_mode mode, float epsilon, float alpha_data,
+    const memory_desc_ext &src_desc, void *src,
+    const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data,
+    const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param,
+    const memory_desc_ext &diff_scale_bias_mean_var_desc, void *scale,
+    float beta_param, void *diff_scale, void *diff_bias, void *saved_mean,
+    void *saved_var) {
+
+  return batch_normalization_backward_internal(
+      mode, epsilon, alpha_data, src_desc, src, diff_dst_desc, diff_dst,
+      beta_data, diff_src_desc, diff_src, alpha_param,
+      diff_scale_bias_mean_var_desc, scale, nullptr, beta_param, diff_scale,
+      diff_bias, diff_scale_bias_mean_var_desc, saved_mean, saved_var);
+}
+
+inline
+sycl::event engine_ext::async_batch_normalization_backward(
+    batch_normalization_mode mode, batch_normalization_ops ops,
+    activation_desc &adesc, float epsilon, float alpha_data,
+    const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc,
+    void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst,
+    float beta_data, const memory_desc_ext &diff_src_desc, void *diff_src,
+    const memory_desc_ext &diff_summand_desc, void *diff_summand,
+    float alpha_param, const memory_desc_ext &diff_scale_bias_desc, void *scale,
+    void *bias, float beta_param, void *diff_scale, void *diff_bias,
+    const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var,
+    size_t workspace_size, void *workspace) {
+  std::vector<void *> caches;
+  ::dnnl::memory::desc real_diff_dst_desc = diff_dst_desc.get_desc();
+  void *real_diff_dst = diff_dst;
+
+  if (ops != batch_normalization_ops::none &&
+      workspace_size < dst_desc.get_desc().get_size()) {
+    throw std::runtime_error("async_batch_normalization_backward_ex: "
+                             "no sufficient workspace.");
+  }
+  enter_primitive(diff_scale_bias_desc.get_size() * 8 +
+                  src_desc.get_size() * 3 + diff_dst_desc.get_size() * 5 +
+                  diff_src_desc.get_size() + mean_var_desc.get_size() * 9 +
+                  diff_summand_desc.get_size());
+  if (ops == batch_normalization_ops::add_activation) {
+    void *diff_summand_cache = allocate(diff_summand_desc);
+    async_activation_backward(adesc, 1.f, dst_desc, dst, diff_dst_desc, diff_dst,
+                        dst_desc, workspace, 0.f,
+                        diff_summand_desc, diff_summand_cache);
+    async_sum(alpha_data, diff_summand_desc, diff_summand_cache, beta_data,
+        diff_summand_desc, diff_summand);
+    real_diff_dst_desc = diff_summand_desc.get_desc();
+    real_diff_dst = diff_summand_cache;
+  } else if (ops == batch_normalization_ops::activation) {
+    void *diff_dst_cache = allocate(diff_dst_desc);
+    async_activation_backward(adesc, 1.f, dst_desc, dst, diff_dst_desc,
+                        diff_dst, dst_desc, workspace,
+                        0.f, diff_dst_desc, diff_dst_cache);
+    real_diff_dst = diff_dst_cache;
+  }
+
+  return exit_primitive(batch_normalization_backward_internal(
+      mode, epsilon, alpha_data, src_desc, src, real_diff_dst_desc,
+      real_diff_dst, beta_data, diff_src_desc, diff_src, alpha_param,
+      diff_scale_bias_desc, scale, bias, beta_param, diff_scale, diff_bias,
+      mean_var_desc, saved_mean, saved_var));
+}
+
+inline
+sycl::event engine_ext::async_batch_normalization_backward(
+    batch_normalization_mode mode, batch_normalization_ops ops,
+    activation_desc &adesc, float epsilon, float alpha_data,
+    const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc,
+    void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst,
+    float beta_data, const memory_desc_ext &diff_src_desc, void *diff_src,
+    const memory_desc_ext &diff_summand_desc, void *diff_summand,
+    float alpha_param, const memory_desc_ext &diff_scale_bias_mean_var_desc,
+    void *scale, void *bias, float beta_param, void *diff_scale,
+    void *diff_bias, void *saved_mean, void *saved_var,
+    size_t workspace_size, void *workspace) {
+
+  return async_batch_normalization_backward(
+      mode, ops, adesc, epsilon, alpha_data, src_desc, src, dst_desc, dst,
+      diff_dst_desc, diff_dst, beta_data, diff_src_desc, diff_src,
+      diff_summand_desc, diff_summand, alpha_param,
+      diff_scale_bias_mean_var_desc, scale, bias, beta_param, diff_scale,
+      diff_bias, diff_scale_bias_mean_var_desc, saved_mean, saved_var,
+      workspace_size, workspace);
+}
+
+inline
+sycl::event
+engine_ext::async_convolution_forward(convolution_desc &desc, ::dnnl::algorithm alg,
+                                float alpha, const memory_desc_ext &src_desc,
+                                void *src, const memory_desc_ext &weight_desc,
+                                void *weight, float beta,
+                                const memory_desc_ext &dst_desc, void *dst) {
+  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
+    return sycl::event();
+  }
+  auto help_weight_desc =
+      get_group_weight_desc(desc.get_group_count(), weight_desc);
+
+  ::dnnl::primitive_attr attr;
+  attr.set_fpmath_mode(desc.get_math_mode());
+
+  auto origin_src_md = src_desc.get_desc();
+  auto origin_dst_md = dst_desc.get_desc();
+  auto origin_weight_md = help_weight_desc;
+  auto src_md = transfer_memory_desc_to_format_tag_any(origin_src_md);
+  auto dst_md = transfer_memory_desc_to_format_tag_any(origin_dst_md);
+  auto weight_md = transfer_memory_desc_to_format_tag_any(origin_weight_md);
+
+  auto primitive_args =
+      create_primitive_args_or_get<::dnnl::convolution_forward>(
+          ::dnnl::prop_kind::forward_training, alg, src_md, weight_md, dst_md,
+          desc.get_stride(), desc.get_dilate(), desc.get_padding(),
+          desc.get_padding(), attr);
+
+  auto pd = get_primitive_desc<::dnnl::convolution_forward>(
+      primitive_args.second.primitive);
+  auto optimal_src_md = pd.src_desc();
+  auto optimal_dst_md = pd.dst_desc();
+  auto optimal_weight_md = pd.weights_desc();
+
+  enter_primitive(
+      optimal_src_md.get_size() * 3 + optimal_dst_md.get_size() * 5 +
+      optimal_weight_md.get_size() * 3 + origin_dst_md.get_size() * 2);
+
+  void *optimal_src = src, *optimal_dst = dst, *optimal_weight = weight;
+  allocate_and_reorder_memory_to_optimal(origin_src_md, src, optimal_src_md,
+                                         optimal_src);
+  allocate_and_reorder_memory_to_optimal(origin_weight_md, weight,
+                                         optimal_weight_md, optimal_weight);
+
+  if (beta == 0.f) {
+    if(origin_dst_md != optimal_dst_md) {
+      optimal_dst = allocate(optimal_dst_md);
+    }
+  } else {
+    allocate_and_reorder_memory_to_optimal(origin_dst_md, dst, optimal_dst_md,
+                                           optimal_dst);
+  }
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, optimal_src_md,
+             optimal_src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md,
+             optimal_weight);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, optimal_dst_md,
+             optimal_dst);
+
+  auto e = execute_primitive<::dnnl::convolution_forward>(
+      primitive_args,
+      {{alpha, beta, DNNL_ARG_DST, optimal_dst_md, optimal_dst}});
+
+  if (origin_dst_md != optimal_dst_md) {
+    e = async_reorder(1.f, optimal_dst_md, optimal_dst, 0.f, origin_dst_md,
+                      dst);
+  }
+  return exit_primitive(e);
+}
+
+inline
+sycl::event engine_ext::async_convolution_forward(
+    convolution_desc &desc, ::dnnl::algorithm alg, activation_desc &adesc,
+    float alpha_0, const memory_desc_ext &src_desc, void *src,
+    const memory_desc_ext &weight_desc, void *weight, float alpha_1,
+    const memory_desc_ext &summand_desc, void *summand,
+    const memory_desc_ext &bias_desc, void *bias,
+    const memory_desc_ext &dst_desc, void *dst) {
+
+  int channel_num = bias_desc.get_element_num();
+  auto help_weight_desc =
+      get_group_weight_desc(desc.get_group_count(), weight_desc);
+  ::dnnl::memory::desc help_bias_desc = {{channel_num},
+                                         bias_desc.get_desc().get_data_type(),
+                                         ::dnnl::memory::format_tag::a};
+  auto origin_weight_md = help_weight_desc;
+  auto origin_bias_md = help_bias_desc;
+  auto origin_src_md = src_desc.get_desc();
+  auto origin_dst_md = dst_desc.get_desc();
+  auto src_md = transfer_memory_desc_to_format_tag_any(origin_src_md);
+  auto dst_md = transfer_memory_desc_to_format_tag_any(origin_dst_md);
+  auto weight_md = transfer_memory_desc_to_format_tag_any(origin_weight_md);
+  auto bias_md = transfer_memory_desc_to_format_tag_any(origin_bias_md);
+
+  ::dnnl::primitive_attr attr;
+  attr.set_fpmath_mode(desc.get_math_mode());
+
+  auto primitive_args =
+      create_primitive_args_or_get<::dnnl::convolution_forward>(
+          ::dnnl::prop_kind::forward_training, alg, src_md, weight_md, bias_md,
+          dst_md, desc.get_stride(), desc.get_dilate(), desc.get_padding(),
+          desc.get_padding(), attr);
+
+  auto pd = get_primitive_desc<::dnnl::convolution_forward>(
+      primitive_args.second.primitive);
+  auto optimal_src_md = pd.src_desc();
+  auto optimal_dst_md = pd.dst_desc();
+  auto optimal_weight_md = pd.weights_desc();
+  auto optimal_bias_md = pd.bias_desc();
+
+  enter_primitive(optimal_src_md.get_size() + 3 * optimal_weight_md.get_size() +
+                  optimal_bias_md.get_size() + 7 * optimal_dst_md.get_size() +
+                  summand_desc.get_size());
+
+  void *optimal_src = src, *optimal_dst = dst, *optimal_weight = weight,
+       *optimal_bias = bias;
+  allocate_and_reorder_memory_to_optimal(origin_src_md, src, optimal_src_md,
+                                         optimal_src);
+  allocate_and_reorder_memory_to_optimal(origin_weight_md, weight,
+                                         optimal_weight_md, optimal_weight);
+  allocate_and_reorder_memory_to_optimal(origin_bias_md, bias, optimal_bias_md,
+                                         optimal_bias);
+  if (origin_dst_md != optimal_dst_md) {
+    optimal_dst = allocate(optimal_dst_md);
+  }
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, optimal_src_md,
+             optimal_src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_BIAS, optimal_bias_md,
+             optimal_bias);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, optimal_dst_md,
+             optimal_dst);
+
+  void *cache = nullptr;
+  if (alpha_0 != 1.f) {
+    cache = allocate(optimal_weight_md);
+    _q->memcpy(cache, optimal_weight, optimal_weight_md.get_size());
+    async_scale(alpha_0, optimal_weight_md, cache);
+    insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md,
+               cache);
+    execute_primitive<::dnnl::convolution_forward>(
+        primitive_args,
+        {{1.f, 0.f, DNNL_ARG_DST, optimal_dst_md, optimal_dst}});
+  } else {
+    insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md,
+               optimal_weight);
+    execute_primitive<::dnnl::convolution_forward>(
+        primitive_args,
+        {{1.f, 0.f, DNNL_ARG_DST, optimal_dst_md, optimal_dst}});
+  }
+  if (origin_dst_md != optimal_dst_md) {
+    async_reorder(1.f, optimal_dst_md, optimal_dst, 0.f, origin_dst_md, dst);
+  }
+  async_sum(alpha_1, summand_desc, summand, 1.f, dst_desc, dst);
+  return exit_primitive(
+      async_activation_forward(adesc, 1.f, dst_desc, dst, 0.f, dst_desc, dst));
+}
+
+inline
+sycl::event engine_ext::async_convolution_backward_data(
+    convolution_desc &desc, ::dnnl::algorithm alg, float alpha,
+    const memory_desc_ext &weight_desc, void *weight,
+    const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta,
+    const memory_desc_ext &diff_src_desc, void *diff_src) {
+
+  if (scale_parameter_preprocess({{alpha, beta, diff_dst_desc, diff_dst}})) {
+    return sycl::event();
+  }
+
+  auto help_weight_desc =
+      get_group_weight_desc(desc.get_group_count(), weight_desc);
+
+  auto origin_weight_md = help_weight_desc;
+  auto origin_diff_src_md = diff_src_desc.get_desc();
+  auto origin_diff_dst_md = diff_dst_desc.get_desc();
+  auto diff_src_md = transfer_memory_desc_to_format_tag_any(origin_diff_src_md);
+  auto diff_dst_md = transfer_memory_desc_to_format_tag_any(origin_diff_dst_md);
+  auto weight_md = transfer_memory_desc_to_format_tag_any(origin_weight_md);
+
+  ::dnnl::primitive_attr attr;
+  attr.set_fpmath_mode(desc.get_math_mode());
+
+  auto forward_primitive = create_primitive_desc<::dnnl::convolution_forward>(
+      ::dnnl::prop_kind::forward_training, ::dnnl::algorithm::convolution_auto,
+      diff_src_md, weight_md, diff_dst_md, desc.get_stride(), desc.get_dilate(),
+      desc.get_padding(), desc.get_padding(), attr);
+
+  auto primitive_args =
+      create_primitive_args_or_get<::dnnl::convolution_backward_data>(
+          ::dnnl::algorithm::convolution_auto, diff_src_md, weight_md,
+          diff_dst_md, desc.get_stride(), desc.get_dilate(), desc.get_padding(),
+          desc.get_padding(), forward_primitive, attr);
+
+  auto pd = get_primitive_desc<::dnnl::convolution_backward_data>(
+      primitive_args.second.primitive);
+  auto optimal_diff_src_md = pd.diff_src_desc();
+  auto optimal_diff_dst_md = pd.diff_dst_desc();
+  auto optimal_weight_md = pd.weights_desc();
+
+  enter_primitive(5 * optimal_diff_src_md.get_size() +
+                  optimal_diff_dst_md.get_size() +
+                  optimal_weight_md.get_size());
+
+  void *optimal_diff_src = diff_src, *optimal_diff_dst = diff_dst,
+       *optimal_weight = weight;
+  allocate_and_reorder_memory_to_optimal(origin_diff_dst_md, diff_dst,
+                                         optimal_diff_dst_md, optimal_diff_dst);
+  allocate_and_reorder_memory_to_optimal(origin_weight_md, weight,
+                                         optimal_weight_md, optimal_weight);
+  if (beta == 0.f) {
+    if (origin_diff_src_md != optimal_diff_src_md) {
+      optimal_diff_src = allocate(optimal_diff_src_md);
+    }
+  } else {
+    allocate_and_reorder_memory_to_optimal(
+        origin_diff_src_md, diff_src, optimal_diff_src_md, optimal_diff_src);
+  }
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, optimal_diff_dst_md,
+             optimal_diff_dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md,
+             optimal_weight);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, optimal_diff_src_md,
+             optimal_diff_src);
+
+  auto e = execute_primitive<::dnnl::convolution_backward_data>(
+      primitive_args,
+      {{alpha, beta, DNNL_ARG_DIFF_SRC, optimal_diff_src_md, optimal_diff_src}});
+
+  if (origin_diff_src_md != optimal_diff_src_md) {
+    e = async_reorder(1.f, optimal_diff_src_md, optimal_diff_src, 0.f,
+                      origin_diff_src_md, diff_src);
+  }
+  return exit_primitive(e);
+}
+
+inline
+sycl::event engine_ext::async_convolution_backward_weight(
+    convolution_desc &desc, ::dnnl::algorithm alg, float alpha,
+    const memory_desc_ext &src_desc, void *src,
+    const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta,
+    const memory_desc_ext &diff_weight_desc, void *diff_weight) {
+
+  if (scale_parameter_preprocess(
+          {{alpha, beta, diff_weight_desc, diff_weight}})) {
+    return sycl::event();
+  }
+
+  auto help_diff_weight_desc =
+      get_group_weight_desc(desc.get_group_count(), diff_weight_desc);
+
+  ::dnnl::primitive_attr attr;
+  attr.set_fpmath_mode(desc.get_math_mode());
+
+  auto origin_diff_weight_md = help_diff_weight_desc;
+  auto origin_src_md = src_desc.get_desc();
+  auto origin_diff_dst_md = diff_dst_desc.get_desc();
+  auto src_md = transfer_memory_desc_to_format_tag_any(origin_src_md);
+  auto diff_dst_md = transfer_memory_desc_to_format_tag_any(origin_diff_dst_md);
+  auto diff_weight_md =
+      transfer_memory_desc_to_format_tag_any(origin_diff_weight_md);
+
+  auto forward_primitive = create_primitive_desc<::dnnl::convolution_forward>(
+      ::dnnl::prop_kind::forward_training, ::dnnl::algorithm::convolution_auto,
+      src_md, diff_weight_md, diff_dst_md, desc.get_stride(), desc.get_dilate(),
+      desc.get_padding(), desc.get_padding(), attr);
+
+  auto primitive_args =
+      create_primitive_args_or_get<::dnnl::convolution_backward_weights>(
+          ::dnnl::algorithm::convolution_auto, src_md, diff_weight_md,
+          diff_dst_md, desc.get_stride(), desc.get_dilate(), desc.get_padding(),
+          desc.get_padding(), forward_primitive, attr);
+
+  auto pd = get_primitive_desc<::dnnl::convolution_backward_weights>(
+      primitive_args.second.primitive);
+  auto optimal_src_md = pd.src_desc();
+  auto optimal_diff_dst_md = pd.diff_dst_desc();
+  auto optimal_diff_weight_md = pd.diff_weights_desc();
+
+  enter_primitive(optimal_diff_weight_md.get_size() * 5 +
+                  optimal_diff_dst_md.get_size() + optimal_src_md.get_size());
+
+  void *optimal_src = src, *optimal_diff_dst = diff_dst,
+       *optimal_diff_weight = diff_weight;
+  allocate_and_reorder_memory_to_optimal(origin_diff_dst_md, diff_dst,
+                                         optimal_diff_dst_md, optimal_diff_dst);
+  allocate_and_reorder_memory_to_optimal(origin_src_md, src, optimal_src_md,
+                                         optimal_src);
+  if (beta == 0.f) {
+    if (origin_diff_weight_md != optimal_diff_weight_md) {
+      optimal_diff_weight = allocate(optimal_diff_weight_md);
+    }
+  } else {
+    allocate_and_reorder_memory_to_optimal(origin_diff_weight_md, diff_weight,
+                                           optimal_diff_weight_md,
+                                           optimal_diff_weight);
+  }
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, optimal_src_md,
+             optimal_src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, optimal_diff_dst_md,
+             optimal_diff_dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_WEIGHTS,
+             optimal_diff_weight_md, optimal_diff_weight);
+
+  auto e = execute_primitive<::dnnl::convolution_backward_weights>(
+      primitive_args, {{alpha, beta, DNNL_ARG_DIFF_WEIGHTS,
+                        optimal_diff_weight_md, optimal_diff_weight}});
+
+  if (origin_diff_weight_md != optimal_diff_weight_md) {
+    e = async_reorder(1.f, optimal_diff_weight_md, optimal_diff_weight, 0.f,
+                      origin_diff_weight_md, diff_weight);
+  }
+  return exit_primitive(e);
+}
+
+inline
+sycl::event engine_ext::async_convolution_backward_bias(
+    float alpha, const memory_desc_ext &diff_dst_desc, void *diff_dst,
+    float beta, const memory_desc_ext &diff_bias_desc, void *diff_bias) {
+  return async_reduction(reduction_op::sum, alpha, diff_dst_desc, diff_dst, beta,
+                   diff_bias_desc, diff_bias);
+}
+
+inline
+void engine_ext::rnn_get_weight_space_size(const rnn_desc &desc,
+                                           size_t *weight_space_size) {
+  *weight_space_size = 0;
+  rnn_forward_internal(desc, ::dnnl::prop_kind::forward_inference,
+                       memory_desc_ext(), nullptr, memory_desc_ext(), nullptr,
+                       memory_desc_ext(), nullptr, nullptr, memory_desc_ext(),
+                       nullptr, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, true,
+                       weight_space_size, nullptr, nullptr);
+  return;
+}
+
+inline
+void engine_ext::rnn_get_scratchpad_workspace_size(
+    const rnn_desc &desc, ::dnnl::prop_kind kind,
+    const memory_desc_ext &src_desc, size_t *scratchpad_size,
+    size_t *workspace_size) {
+  *workspace_size = 0;
+  *scratchpad_size = 0;
+  rnn_forward_internal(desc, kind, src_desc, nullptr, memory_desc_ext(),
+                       nullptr, memory_desc_ext(), nullptr, nullptr,
+                       memory_desc_ext(), nullptr, nullptr, 0, nullptr, 0,
+                       nullptr, 0, nullptr, true, nullptr, workspace_size,
+                       scratchpad_size);
+  return;
+}
+
+inline
+sycl::event engine_ext::async_rnn_forward(
+    const rnn_desc &desc, ::dnnl::prop_kind kind,
+    const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc,
+    void *dst, const memory_desc_ext &iter_desc, void *src_iter, void *dst_iter,
+    const memory_desc_ext &iter_c_desc, void *src_iter_c, void *dst_iter_c,
+    size_t weight_size, void *weight, size_t scratchpad_size, void *scratchpad,
+    size_t workspace_size, void *workspace) {
+
+  return rnn_forward_internal(
+      desc, kind, src_desc, src, dst_desc, dst, iter_desc, src_iter, dst_iter,
+      iter_c_desc, src_iter_c, dst_iter_c, weight_size, weight, workspace_size,
+      workspace, scratchpad_size, scratchpad, false, nullptr, nullptr,
+      nullptr);
+}
+
+inline
+sycl::event engine_ext::async_rnn_backward(
+    const rnn_desc &desc, const memory_desc_ext &dst_desc, void *dst,
+    void *diff_dst, const memory_desc_ext &src_desc, void *src, void *diff_src,
+    const memory_desc_ext &iter_desc, void *src_iter, void *diff_dst_iter,
+    void *diff_src_iter, const memory_desc_ext &iter_c_desc, void *src_iter_c,
+    void *diff_dst_iter_c, void *diff_src_iter_c, size_t weight_size,
+    void *weight, void *diff_weight, size_t scratchpad_size, void *scratchpad,
+    size_t workspace_size, void *workspace) {
+  ::dnnl::memory::data_type src_dt;
+  ::dnnl::memory::format_tag src_format_tag;
+  rnn_mode mode;
+  rnn_memory_format_tag format_tag;
+  rnn_bias_mode bias_mode;
+  rnn_direction direction;
+  dpct::library_data_t dt;
+  int direction_num = 1, input_size = 0, hidden_size = 0, projection_size = 0,
+      layer_size = 0, gate_num = 1, output_size = 0, data_type_size = 0,
+      seq_length = 1, batch_size = 1;
+  void *last_layer_cache = nullptr;
+  void *hidden_layer_cache = nullptr;
+  sycl::event e;
+  enter_primitive(src_desc.get_size() * 2);
+  std::vector<int> offset(9, 0);
+  std::vector<void *> data = {
+      src,
+      dst,
+      (uint8_t *)src_iter + iter_desc.get_size(),
+      nullptr,
+      (uint8_t *)src_iter_c + iter_c_desc.get_size(),
+      nullptr,
+      (uint8_t *)weight + weight_size,
+      (uint8_t *)workspace + workspace_size,
+      diff_src,
+      diff_dst,
+      (uint8_t *)diff_src_iter + iter_desc.get_size(),
+      (uint8_t *)diff_dst_iter + iter_desc.get_size(),
+      (uint8_t *)diff_src_iter_c + iter_c_desc.get_size(),
+      (uint8_t *)diff_dst_iter_c + iter_c_desc.get_size(),
+      (uint8_t *)diff_weight + weight_size,
+      scratchpad};
+
+  desc.get(&mode, &bias_mode, &direction, &dt, &input_size, &hidden_size,
+           &projection_size, &layer_size);
+
+  get_rnn_configuration(src_desc.get_desc(), direction, mode, dt, hidden_size,
+                        &src_dt, &src_format_tag, &projection_size,
+                        &output_size, &seq_length, &batch_size, &direction_num,
+                        &gate_num);
+
+  if (direction == rnn_direction::bidirectional) {
+    if (layer_size > 1) {
+      last_layer_cache = allocate(src_desc);
+      hidden_layer_cache = allocate(src_desc);
+      data[8] = last_layer_cache;
+    }
+    e = execute_rnn_backward_primitive(
+        mode, ::dnnl::rnn_direction::bidirectional_concat, bias_mode, src_dt,
+        src_format_tag, seq_length, batch_size, output_size, 2 * output_size, 1,
+        direction_num, hidden_size, gate_num, projection_size, data, offset, 1);
+    if (layer_size > 1) {
+      data[8] = hidden_layer_cache;
+      data[9] = last_layer_cache;
+      e = execute_rnn_backward_primitive(
+          mode, ::dnnl::rnn_direction::bidirectional_sum, bias_mode, src_dt,
+          src_format_tag, seq_length, batch_size, output_size, output_size, 1,
+          direction_num, hidden_size, gate_num, projection_size, data, offset,
+          layer_size - 1);
+      _q->memcpy(diff_src,
+                 ((layer_size - 1) % 2 == 0) ? last_layer_cache
+                                             : hidden_layer_cache,
+                 src_desc.get_size());
+    }
+  } else {
+    e = execute_rnn_backward_primitive(
+        mode, ::dnnl::rnn_direction::unidirectional_left2right, bias_mode,
+        src_dt, src_format_tag, seq_length, batch_size, output_size,
+        output_size, layer_size, direction_num, hidden_size, gate_num,
+        projection_size, data, offset, 1);
+  }
+
+  return exit_primitive(e);
+}
+
+inline
+size_t engine_ext::get_dropout_state_size(){
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
+                           "Interfaces Project does not support this API.");
+#else
+  auto r = get_internal_resource(_q);
+  if(r->random_engine_state_size == -1){
+    auto rand_engine = rng_engine_t(*_q, 0);
+    r->random_engine_state_size =
+        oneapi::mkl::rng::get_state_size(rand_engine);
+  }
+  return r->random_engine_state_size;
+#endif
+}
+
+inline size_t
+engine_ext::get_dropout_workspace_size(const memory_desc_ext &src_desc) {
+  return src_desc.get_size();
+}
+
+inline
+sycl::event engine_ext::async_dropout_forward(dropout_desc &desc,
+                                              const memory_desc_ext &src_desc,
+                                              void *src,
+                                              const memory_desc_ext &dst_desc,
+                                              void *dst, void *workspace,
+                                              size_t workspace_size) {
+  if (workspace_size < src_desc.get_size()) {
+    throw std::runtime_error("async_dropout_forward: no sufficient workspace.");
+  }
+  enter_primitive(src_desc.get_size() * 2 + dst_desc.get_size() * 2);
+  float p = desc.get_probability();
+  if (p == 1.f) {
+    return _q->memset(dst, 0, dst_desc.get_size());
+  } else if (p == 0.f) {
+    return async_reorder(1.f, src_desc, src, 0.f, dst_desc, dst);
+  }
+
+  float scale_factor = 1.f / (1.f - p);
+  void *cache = workspace;
+
+  memory_desc_ext rng_data_desc(
+      ::dnnl::memory::desc(src_desc.get_dims(), ::dnnl::memory::data_type::s32,
+                           src_desc.get_strides()));
+  if (src_desc.get_desc().get_data_type() != ::dnnl::memory::data_type::s32) {
+    cache = allocate(rng_data_desc);
+  }
+
+  desc.generate(_q, get_dropout_state_size(), rng_data_desc.get_element_num(),
+                (std::int32_t *)cache);
+
+  if (cache == workspace) {
+    async_scale(scale_factor, src_desc, workspace);
+  } else {
+    async_reorder(scale_factor, rng_data_desc, cache, 0.f, src_desc, workspace);
+  }
+
+  auto primitive_args = create_primitive_args_or_get<::dnnl::binary>(
+      ::dnnl::algorithm::binary_mul, src_desc.get_desc(), src_desc.get_desc(),
+      dst_desc.get_desc());
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_0, src_desc.get_desc(),
+             src);
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_1, src_desc.get_desc(),
+             workspace);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
+             dst);
+
+  return exit_primitive(execute_primitive<::dnnl::binary>(primitive_args));
+}
+
+inline
+sycl::event engine_ext::async_dropout_backward(
+    dropout_desc &desc, const memory_desc_ext &diff_dst_desc,
+    void *diff_dst, const memory_desc_ext &diff_src_desc, void *diff_src,
+    void *workspace, size_t workspace_size) {
+  enter_primitive(2 * diff_src_desc.get_size());
+  float p = desc.get_probability();
+  if (p == 1.f) {
+    return _q->memset(diff_src, 0, diff_src_desc.get_size());
+  } else if (p == 0.f) {
+    return async_reorder(1.f, diff_dst_desc, diff_dst, 0.f, diff_src_desc,
+                         diff_src);
+  }
+
+  auto primitive_args = create_primitive_args_or_get<::dnnl::binary>(
+      ::dnnl::algorithm::binary_mul, diff_dst_desc.get_desc(),
+      diff_dst_desc.get_desc(), diff_src_desc.get_desc());
+
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_0,
+             diff_dst_desc.get_desc(), diff_dst);
+  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_1,
+             diff_dst_desc.get_desc(), workspace);
+  insert_arg(primitive_args.second.args, DNNL_ARG_DST, diff_src_desc.get_desc(),
+             diff_src);
+
+  return exit_primitive(execute_primitive<::dnnl::binary>(primitive_args));
+}
+} // namespace dnnl
+} // namespace dpct
+
+#endif // __DPCT_DNNL_UTILS_HPP__
diff --git a/dpct/dpct.hpp b/dpct/dpct.hpp
new file mode 100644
index 0000000000000..8cc312f0ea31d
--- /dev/null
+++ b/dpct/dpct.hpp
@@ -0,0 +1,62 @@
+//==---- dpct.hpp ---------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_HPP__
+#define __DPCT_HPP__
+
+#include <sycl/sycl.hpp>
+#include <iostream>
+#include <limits.h>
+#include <math.h>
+
+template <class... Args> class dpct_kernel_name;
+template <int Arg> class dpct_kernel_scalar;
+
+#include "atomic.hpp"
+#include "device.hpp"
+#include "image.hpp"
+#include "kernel.hpp"
+#include "math.hpp"
+#include "memory.hpp"
+#include "util.hpp"
+
+#if defined(_MSC_VER)
+#define __dpct_align__(n) __declspec(align(n))
+#define __dpct_inline__ __forceinline
+#else
+#define __dpct_align__(n) __attribute__((aligned(n)))
+#define __dpct_inline__ __inline__ __attribute__((always_inline))
+#endif
+
+#if defined(_MSC_VER)
+#define __dpct_noinline__ __declspec(noinline)
+#else
+#define __dpct_noinline__ __attribute__((noinline))
+#endif
+
+#define DPCT_COMPATIBILITY_TEMP (900)
+
+namespace dpct{
+enum error_code { success = 0, default_error = 999 };
+}
+
+#define DPCT_CHECK_ERROR(expr)                                                 \
+  [&]() {                                                                      \
+    try {                                                                      \
+      expr;                                                                    \
+      return dpct::success;                                                    \
+    } catch (std::exception const &e) {                                        \
+      std::cerr << e.what() << std::endl;                                      \
+      return dpct::default_error;                                              \
+    }                                                                          \
+  }()
+
+#define DPCT_PI_F (3.14159274101257f)
+#define DPCT_PI (3.141592653589793115998)
+
+#endif // __DPCT_HPP__
diff --git a/dpct/dpl_extras/algorithm.h b/dpct/dpl_extras/algorithm.h
new file mode 100644
index 0000000000000..7c98b7a2282f9
--- /dev/null
+++ b/dpct/dpl_extras/algorithm.h
@@ -0,0 +1,2419 @@
+//==---- algorithm.h ------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_ALGORITHM_H__
+#define __DPCT_ALGORITHM_H__
+
+#include <oneapi/dpl/algorithm>
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/numeric>
+
+#include "functional.h"
+#include "iterators.h"
+#include "vector.h"
+
+namespace dpct {
+
+template <typename Policy, typename Iter1, typename Iter2, typename Pred,
+          typename T>
+void replace_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p,
+                const T &new_value) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  std::transform(
+      std::forward<Policy>(policy), first, last, mask, first,
+      internal::replace_if_fun<typename std::iterator_traits<Iter1>::value_type,
+                               Pred>(p, new_value));
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Pred, typename T>
+Iter3 replace_copy_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
+                      Iter3 result, Pred p, const T &new_value) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  return std::transform(
+      std::forward<Policy>(policy), first, last, mask, result,
+      internal::replace_if_fun<typename std::iterator_traits<Iter3>::value_type,
+                               Pred>(p, new_value));
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Pred>
+internal::enable_if_hetero_execution_policy<Policy, Iter1>
+remove_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  using oneapi::dpl::make_zip_iterator;
+  using policy_type = typename std::decay<Policy>::type;
+  using internal::__buffer;
+  using ValueType = typename std::iterator_traits<Iter1>::value_type;
+
+  __buffer<ValueType> _tmp(std::distance(first, last));
+
+  auto end = std::copy_if(
+      policy, make_zip_iterator(first, mask),
+      make_zip_iterator(last, mask + std::distance(first, last)),
+      make_zip_iterator(_tmp.get(), oneapi::dpl::discard_iterator()),
+      internal::negate_predicate_key_fun<Pred>(p));
+  return std::copy(std::forward<Policy>(policy), _tmp.get(),
+                   std::get<0>(end.base()), first);
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Pred>
+typename std::enable_if<!internal::is_hetero_execution_policy<
+                            typename std::decay<Policy>::type>::value,
+                        Iter1>::type
+remove_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  using oneapi::dpl::make_zip_iterator;
+  using policy_type = typename std::decay<Policy>::type;
+  using ValueType = typename std::iterator_traits<Iter1>::value_type;
+
+  std::vector<ValueType> _tmp(std::distance(first, last));
+
+  auto end = std::copy_if(
+      policy, make_zip_iterator(first, mask),
+      make_zip_iterator(last, mask + std::distance(first, last)),
+      make_zip_iterator(_tmp.begin(), oneapi::dpl::discard_iterator()),
+      internal::negate_predicate_key_fun<Pred>(p));
+  return std::copy(policy, _tmp.begin(), std::get<0>(end.base()), first);
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Pred>
+Iter3 remove_copy_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
+                     Iter3 result, Pred p) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  using oneapi::dpl::make_zip_iterator;
+  auto ret_val = std::remove_copy_if(
+      std::forward<Policy>(policy), make_zip_iterator(first, mask),
+      make_zip_iterator(last, mask + std::distance(first, last)),
+      make_zip_iterator(result, oneapi::dpl::discard_iterator()),
+      internal::predicate_key_fun<Pred>(p));
+  return std::get<0>(ret_val.base());
+}
+
+template <class Policy, class Iter1, class Iter2, class BinaryPred>
+std::pair<Iter1, Iter2> unique(Policy &&policy, Iter1 keys_first,
+                               Iter1 keys_last, Iter2 values_first,
+                               BinaryPred binary_pred) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::unique(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(keys_first, values_first),
+      oneapi::dpl::make_zip_iterator(
+          keys_last, values_first + std::distance(keys_first, keys_last)),
+      internal::compare_key_fun<BinaryPred>(binary_pred));
+  auto n1 = std::distance(
+      oneapi::dpl::make_zip_iterator(keys_first, values_first), ret_val);
+  return std::make_pair(keys_first + n1, values_first + n1);
+}
+
+template <class Policy, class Iter1, class Iter2>
+std::pair<Iter1, Iter2> unique(Policy &&policy, Iter1 keys_first,
+                               Iter1 keys_last, Iter2 values_first) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  using T = typename std::iterator_traits<Iter1>::value_type;
+  return unique(std::forward<Policy>(policy), keys_first, keys_last,
+                values_first, std::equal_to<T>());
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
+          class BinaryPred>
+std::pair<Iter3, Iter4> unique_copy(Policy &&policy, Iter1 keys_first,
+                                    Iter1 keys_last, Iter2 values_first,
+                                    Iter3 keys_result, Iter4 values_result,
+                                    BinaryPred binary_pred) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::unique_copy(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(keys_first, values_first),
+      oneapi::dpl::make_zip_iterator(
+          keys_last, values_first + std::distance(keys_first, keys_last)),
+      oneapi::dpl::make_zip_iterator(keys_result, values_result),
+      internal::unique_fun<BinaryPred>(binary_pred));
+  auto n1 = std::distance(
+      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
+  return std::make_pair(keys_result + n1, values_result + n1);
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4>
+std::pair<Iter3, Iter4> unique_copy(Policy &&policy, Iter1 keys_first,
+                                    Iter1 keys_last, Iter2 values_first,
+                                    Iter3 keys_result, Iter4 values_result) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  using T = typename std::iterator_traits<Iter1>::value_type;
+  auto comp = std::equal_to<T>();
+  return unique_copy(std::forward<Policy>(policy), keys_first, keys_last,
+                     values_first, keys_result, values_result, comp);
+}
+
+template <typename Policy, typename Iter, typename Pred>
+Iter partition_point(Policy &&policy, Iter first, Iter last, Pred p) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter>::iterator_category,
+                   std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  if (std::is_partitioned(policy, first, last, p))
+    return std::find_if_not(std::forward<Policy>(policy), first, last, p);
+  else
+    return first;
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Pred>
+Iter3 copy_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
+              Iter3 result, Pred pred) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::copy_if(
+      std::forward<Policy>(policy), oneapi::dpl::make_zip_iterator(first, mask),
+      oneapi::dpl::make_zip_iterator(last, mask + std::distance(first, last)),
+      oneapi::dpl::make_zip_iterator(result, oneapi::dpl::discard_iterator()),
+      internal::predicate_key_fun<Pred>(pred));
+  return std::get<0>(ret_val.base());
+}
+
+template <class Policy, class Iter1, class Iter2, class UnaryOperation,
+          class Pred>
+Iter2 transform_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 result,
+                   UnaryOperation unary_op, Pred pred) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  using T = typename std::iterator_traits<Iter1>::value_type;
+  const auto n = std::distance(first, last);
+  std::for_each(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(first, result),
+      oneapi::dpl::make_zip_iterator(first, result) + n,
+      internal::transform_if_fun<T, Pred, UnaryOperation>(pred, unary_op));
+  return result + n;
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3,
+          class UnaryOperation, class Pred>
+Iter3 transform_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
+                   Iter3 result, UnaryOperation unary_op, Pred pred) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  using T = typename std::iterator_traits<Iter1>::value_type;
+  using Ref1 = typename std::iterator_traits<Iter1>::reference;
+  using Ref2 = typename std::iterator_traits<Iter2>::reference;
+  const auto n = std::distance(first, last);
+  std::for_each(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(first, mask, result),
+      oneapi::dpl::make_zip_iterator(first, mask, result) + n,
+      internal::transform_if_unary_zip_mask_fun<T, Pred, UnaryOperation>(
+          pred, unary_op));
+  return result + n;
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
+          class BinaryOperation, class Pred>
+Iter4 transform_if(Policy &&policy, Iter1 first1, Iter1 last1, Iter2 first2,
+                   Iter3 mask, Iter4 result, BinaryOperation binary_op,
+                   Pred pred) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  const auto n = std::distance(first1, last1);
+  using ZipIterator =
+      typename oneapi::dpl::zip_iterator<Iter1, Iter2, Iter3, Iter4>;
+  using T = typename std::iterator_traits<ZipIterator>::value_type;
+  std::for_each(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(first1, first2, mask, result),
+      oneapi::dpl::make_zip_iterator(last1, first2 + n, mask + n, result + n),
+      internal::transform_if_zip_mask_fun<T, Pred, BinaryOperation>(pred,
+                                                                    binary_op));
+  return result + n;
+}
+
+template <typename Policy, typename InputIter1, typename InputIter2,
+          typename OutputIter>
+void scatter(Policy &&policy, InputIter1 first, InputIter1 last, InputIter2 map,
+             OutputIter result) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<InputIter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<
+              typename std::iterator_traits<InputIter2>::iterator_category,
+              std::random_access_iterator_tag>::value &&
+          std::is_same<
+              typename std::iterator_traits<OutputIter>::iterator_category,
+              std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  oneapi::dpl::copy(policy, first, last,
+                    oneapi::dpl::make_permutation_iterator(result, map));
+}
+
+template <typename Policy, typename InputIter1, typename InputIter2,
+          typename OutputIter>
+OutputIter gather(Policy &&policy, InputIter1 map_first, InputIter1 map_last,
+                  InputIter2 input_first, OutputIter result) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<InputIter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<
+              typename std::iterator_traits<InputIter2>::iterator_category,
+              std::random_access_iterator_tag>::value &&
+          std::is_same<
+              typename std::iterator_traits<OutputIter>::iterator_category,
+              std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto perm_begin =
+      oneapi::dpl::make_permutation_iterator(input_first, map_first);
+  const int n = ::std::distance(map_first, map_last);
+
+  return oneapi::dpl::copy(policy, perm_begin, perm_begin + n, result);
+}
+
+template <typename Policy, typename InputIter1, typename InputIter2,
+          typename InputIter3, typename OutputIter, typename Predicate>
+void scatter_if(Policy &&policy, InputIter1 first, InputIter1 last,
+                InputIter2 map, InputIter3 mask, OutputIter result,
+                Predicate pred) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<InputIter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<
+              typename std::iterator_traits<InputIter2>::iterator_category,
+              std::random_access_iterator_tag>::value &&
+          std::is_same<
+              typename std::iterator_traits<InputIter3>::iterator_category,
+              std::random_access_iterator_tag>::value &&
+          std::is_same<
+              typename std::iterator_traits<OutputIter>::iterator_category,
+              std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  transform_if(
+      ::std::forward<Policy>(policy), first, last, mask,
+      oneapi::dpl::make_permutation_iterator(result, map),
+      [=](auto &&v) { return v; }, [=](auto &&m) { return pred(m); });
+}
+
+template <typename Policy, typename InputIter1, typename InputIter2,
+          typename InputIter3, typename OutputIter>
+void scatter_if(Policy &&policy, InputIter1 first, InputIter1 last,
+                InputIter2 map, InputIter3 mask, OutputIter result) {
+  scatter_if(::std::forward<Policy>(policy), first, last, map, mask, result,
+             internal::no_op_fun());
+}
+
+template <typename Policy, typename InputIter1, typename InputIter2,
+          typename InputIter3, typename OutputIter, typename Predicate>
+OutputIter gather_if(Policy &&policy, InputIter1 map_first, InputIter1 map_last,
+                     InputIter2 mask, InputIter3 input_first, OutputIter result,
+                     Predicate pred) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<InputIter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<
+              typename std::iterator_traits<InputIter2>::iterator_category,
+              std::random_access_iterator_tag>::value &&
+          std::is_same<
+              typename std::iterator_traits<InputIter3>::iterator_category,
+              std::random_access_iterator_tag>::value &&
+          std::is_same<
+              typename std::iterator_traits<OutputIter>::iterator_category,
+              std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto perm_begin =
+      oneapi::dpl::make_permutation_iterator(input_first, map_first);
+  const int n = std::distance(map_first, map_last);
+
+  return transform_if(
+      ::std::forward<Policy>(policy), perm_begin, perm_begin + n, mask, result,
+      [=](auto &&v) { return v; }, [=](auto &&m) { return pred(m); });
+}
+
+template <typename Policy, typename InputIter1, typename InputIter2,
+          typename InputIter3, typename OutputIter>
+OutputIter gather_if(Policy &&policy, InputIter1 map_first, InputIter1 map_last,
+                     InputIter2 mask, InputIter3 input_first,
+                     OutputIter result) {
+  return gather_if(::std::forward<Policy>(policy), map_first, map_last, mask,
+                   input_first, result, internal::no_op_fun());
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4, typename Iter5, typename Iter6>
+std::pair<Iter5, Iter6>
+merge(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, Iter2 keys_first2,
+      Iter2 keys_last2, Iter3 values_first1, Iter4 values_first2,
+      Iter5 keys_result, Iter6 values_result) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto n1 = std::distance(keys_first1, keys_last1);
+  auto n2 = std::distance(keys_first2, keys_last2);
+  std::merge(std::forward<Policy>(policy),
+             oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
+             oneapi::dpl::make_zip_iterator(keys_last1, values_first1 + n1),
+             oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
+             oneapi::dpl::make_zip_iterator(keys_last2, values_first2 + n2),
+             oneapi::dpl::make_zip_iterator(keys_result, values_result),
+             internal::compare_key_fun<>());
+  return std::make_pair(keys_result + n1 + n2, values_result + n1 + n2);
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4, typename Iter5, typename Iter6, typename Comp>
+std::pair<Iter5, Iter6>
+merge(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, Iter2 keys_first2,
+      Iter2 keys_last2, Iter3 values_first1, Iter4 values_first2,
+      Iter5 keys_result, Iter6 values_result, Comp comp) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto n1 = std::distance(keys_first1, keys_last1);
+  auto n2 = std::distance(keys_first2, keys_last2);
+  std::merge(std::forward<Policy>(policy),
+             oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
+             oneapi::dpl::make_zip_iterator(keys_last1, values_first1 + n1),
+             oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
+             oneapi::dpl::make_zip_iterator(keys_last2, values_first2 + n2),
+             oneapi::dpl::make_zip_iterator(keys_result, values_result),
+             internal::compare_key_fun<Comp>(comp));
+  return std::make_pair(keys_result + n1 + n2, values_result + n1 + n2);
+}
+
+template <class Policy, class Iter, class T>
+void iota(Policy &&policy, Iter first, Iter last, T init, T step) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter>::iterator_category,
+                   std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  using DiffSize = typename std::iterator_traits<Iter>::difference_type;
+  std::transform(
+      std::forward<Policy>(policy), oneapi::dpl::counting_iterator<DiffSize>(0),
+      oneapi::dpl::counting_iterator<DiffSize>(std::distance(first, last)),
+      first, internal::sequence_fun<T>(init, step));
+}
+
+template <class Policy, class Iter, class T>
+void iota(Policy &&policy, Iter first, Iter last, T init) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter>::iterator_category,
+                   std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  iota(std::forward<Policy>(policy), first, last, init, T(1));
+}
+
+template <class Policy, class Iter>
+void iota(Policy &&policy, Iter first, Iter last) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter>::iterator_category,
+                   std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  using DiffSize = typename std::iterator_traits<Iter>::difference_type;
+  iota(std::forward<Policy>(policy), first, last, DiffSize(0), DiffSize(1));
+}
+
+template <class Policy, class Iter1, class Iter2, class Comp>
+void sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last,
+          Iter2 values_first, Comp comp) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto first = oneapi::dpl::make_zip_iterator(keys_first, values_first);
+  auto last = first + std::distance(keys_first, keys_last);
+  std::sort(std::forward<Policy>(policy), first, last,
+            internal::compare_key_fun<Comp>(comp));
+}
+
+template <class Policy, class Iter1, class Iter2>
+void sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last,
+          Iter2 values_first) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  sort(std::forward<Policy>(policy), keys_first, keys_last, values_first,
+       internal::__less());
+}
+
+template <class Policy, class Iter1, class Iter2, class Comp>
+void stable_sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last,
+                 Iter2 values_first, Comp comp) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  std::stable_sort(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(keys_first, values_first),
+      oneapi::dpl::make_zip_iterator(
+          keys_last, values_first + std::distance(keys_first, keys_last)),
+      internal::compare_key_fun<Comp>(comp));
+}
+
+template <class Policy, class Iter1, class Iter2>
+void stable_sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last,
+                 Iter2 values_first) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  stable_sort(std::forward<Policy>(policy), keys_first, keys_last, values_first,
+              internal::__less());
+}
+
+template <class Policy, class Iter, class Operator>
+void for_each_index(Policy &&policy, Iter first, Iter last, Operator unary_op) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter>::iterator_category,
+                   std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  using DiffSize = typename std::iterator_traits<Iter>::difference_type;
+  std::transform(
+      std::forward<Policy>(policy), oneapi::dpl::counting_iterator<DiffSize>(0),
+      oneapi::dpl::counting_iterator<DiffSize>(std::distance(first, last)),
+      first, unary_op);
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
+          class Iter5>
+std::pair<Iter4, Iter5>
+set_intersection(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
+                 Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1,
+                 Iter4 keys_result, Iter5 values_result) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::set_intersection(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
+      oneapi::dpl::make_zip_iterator(
+          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
+      oneapi::dpl::make_zip_iterator(keys_first2,
+                                     oneapi::dpl::discard_iterator()),
+      oneapi::dpl::make_zip_iterator(keys_last2,
+                                     oneapi::dpl::discard_iterator()),
+      oneapi::dpl::make_zip_iterator(keys_result, values_result),
+      internal::compare_key_fun<>());
+  auto n1 = std::distance(
+      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
+  return std::make_pair(keys_result + n1, values_result + n1);
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
+          class Iter5, class Comp>
+std::pair<Iter4, Iter5>
+set_intersection(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
+                 Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1,
+                 Iter4 keys_result, Iter5 values_result, Comp comp) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::set_intersection(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
+      oneapi::dpl::make_zip_iterator(
+          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
+      oneapi::dpl::make_zip_iterator(keys_first2,
+                                     oneapi::dpl::discard_iterator()),
+      oneapi::dpl::make_zip_iterator(keys_last2,
+                                     oneapi::dpl::discard_iterator()),
+      oneapi::dpl::make_zip_iterator(keys_result, values_result),
+      internal::compare_key_fun<Comp>(comp));
+  auto n1 = std::distance(
+      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
+  return std::make_pair(keys_result + n1, values_result + n1);
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
+          class Iter5, class Iter6>
+std::pair<Iter5, Iter6>
+set_symmetric_difference(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
+                         Iter2 keys_first2, Iter2 keys_last2,
+                         Iter3 values_first1, Iter4 values_first2,
+                         Iter5 keys_result, Iter6 values_result) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::set_symmetric_difference(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
+      oneapi::dpl::make_zip_iterator(
+          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
+      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
+      oneapi::dpl::make_zip_iterator(
+          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
+      oneapi::dpl::make_zip_iterator(keys_result, values_result),
+      internal::compare_key_fun<>());
+  auto n1 = std::distance(
+      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
+  return std::make_pair(keys_result + n1, values_result + n1);
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
+          class Iter5, class Iter6, class Comp>
+std::pair<Iter5, Iter6>
+set_symmetric_difference(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
+                         Iter2 keys_first2, Iter2 keys_last2,
+                         Iter3 values_first1, Iter4 values_first2,
+                         Iter5 keys_result, Iter6 values_result, Comp comp) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::set_symmetric_difference(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
+      oneapi::dpl::make_zip_iterator(
+          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
+      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
+      oneapi::dpl::make_zip_iterator(
+          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
+      oneapi::dpl::make_zip_iterator(keys_result, values_result),
+      internal::compare_key_fun<Comp>(comp));
+  auto n1 = std::distance(
+      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
+  return std::make_pair(keys_result + n1, values_result + n1);
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
+          class Iter5, class Iter6>
+std::pair<Iter5, Iter6>
+set_difference(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
+               Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1,
+               Iter4 values_first2, Iter5 keys_result, Iter6 values_result) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::set_difference(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
+      oneapi::dpl::make_zip_iterator(
+          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
+      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
+      oneapi::dpl::make_zip_iterator(
+          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
+      oneapi::dpl::make_zip_iterator(keys_result, values_result),
+      internal::compare_key_fun<>());
+  auto n1 = std::distance(
+      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
+  return std::make_pair(keys_result + n1, values_result + n1);
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
+          class Iter5, class Iter6, class Comp>
+std::pair<Iter5, Iter6> set_difference(Policy &&policy, Iter1 keys_first1,
+                                       Iter1 keys_last1, Iter2 keys_first2,
+                                       Iter2 keys_last2, Iter3 values_first1,
+                                       Iter4 values_first2, Iter5 keys_result,
+                                       Iter6 values_result, Comp comp) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::set_difference(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
+      oneapi::dpl::make_zip_iterator(
+          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
+      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
+      oneapi::dpl::make_zip_iterator(
+          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
+      oneapi::dpl::make_zip_iterator(keys_result, values_result),
+      internal::compare_key_fun<Comp>(comp));
+  auto n1 = std::distance(
+      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
+  return std::make_pair(keys_result + n1, values_result + n1);
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
+          class Iter5, class Iter6>
+internal::enable_if_execution_policy<Policy, std::pair<Iter5, Iter6>>
+set_union(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
+          Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1,
+          Iter4 values_first2, Iter5 keys_result, Iter6 values_result) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::set_union(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
+      oneapi::dpl::make_zip_iterator(
+          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
+      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
+      oneapi::dpl::make_zip_iterator(
+          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
+      oneapi::dpl::make_zip_iterator(keys_result, values_result),
+      internal::compare_key_fun<>());
+  auto n1 = std::distance(
+      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
+  return std::make_pair(keys_result + n1, values_result + n1);
+}
+
+template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
+          class Iter5, class Iter6, class Comp>
+internal::enable_if_execution_policy<Policy, std::pair<Iter5, Iter6>>
+set_union(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
+          Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1,
+          Iter4 values_first2, Iter5 keys_result, Iter6 values_result,
+          Comp comp) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::set_union(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
+      oneapi::dpl::make_zip_iterator(
+          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
+      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
+      oneapi::dpl::make_zip_iterator(
+          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
+      oneapi::dpl::make_zip_iterator(keys_result, values_result),
+      internal::compare_key_fun<Comp>(comp));
+  auto n1 = std::distance(
+      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
+  return std::make_pair(keys_result + n1, values_result + n1);
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4, typename Pred>
+internal::enable_if_execution_policy<Policy, std::pair<Iter3, Iter4>>
+stable_partition_copy(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
+                      Iter3 out_true, Iter4 out_false, Pred p) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  auto ret_val = std::partition_copy(
+      std::forward<Policy>(policy), oneapi::dpl::make_zip_iterator(first, mask),
+      oneapi::dpl::make_zip_iterator(last, mask + std::distance(first, last)),
+      oneapi::dpl::make_zip_iterator(out_true, oneapi::dpl::discard_iterator()),
+      oneapi::dpl::make_zip_iterator(out_false,
+                                     oneapi::dpl::discard_iterator()),
+      internal::predicate_key_fun<Pred>(p));
+  return std::make_pair(std::get<0>(ret_val.first.base()),
+                        std::get<0>(ret_val.second.base()));
+}
+
+template <typename Policy, typename Iter1, typename Iter3, typename Iter4,
+          typename Pred>
+internal::enable_if_execution_policy<Policy, std::pair<Iter3, Iter4>>
+stable_partition_copy(Policy &&policy, Iter1 first, Iter1 last, Iter3 out_true,
+                      Iter4 out_false, Pred p) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  return std::partition_copy(std::forward<Policy>(policy), first, last,
+                             out_true, out_false, p);
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4, typename Pred>
+internal::enable_if_execution_policy<Policy, std::pair<Iter3, Iter4>>
+partition_copy(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
+               Iter3 out_true, Iter4 out_false, Pred p) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
+                       std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  return stable_partition_copy(std::forward<Policy>(policy), first, last, mask,
+                               out_true, out_false, p);
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Pred>
+internal::enable_if_hetero_execution_policy<Policy, Iter1>
+stable_partition(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  typedef typename std::decay<Policy>::type policy_type;
+  internal::__buffer<typename std::iterator_traits<Iter1>::value_type> _tmp(
+      std::distance(first, last));
+
+  std::copy(policy, mask, mask + std::distance(first, last), _tmp.get());
+
+  auto ret_val =
+      std::stable_partition(std::forward<Policy>(policy),
+                            oneapi::dpl::make_zip_iterator(first, _tmp.get()),
+                            oneapi::dpl::make_zip_iterator(
+                                last, _tmp.get() + std::distance(first, last)),
+                            internal::predicate_key_fun<Pred>(p));
+  return std::get<0>(ret_val.base());
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Pred>
+typename std::enable_if<!internal::is_hetero_execution_policy<
+                            typename std::decay<Policy>::type>::value,
+                        Iter1>::type
+stable_partition(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  typedef typename std::decay<Policy>::type policy_type;
+  std::vector<typename std::iterator_traits<Iter1>::value_type> _tmp(
+      std::distance(first, last));
+
+  std::copy(policy, mask, mask + std::distance(first, last), _tmp.begin());
+
+  auto ret_val = std::stable_partition(
+      std::forward<Policy>(policy),
+      oneapi::dpl::make_zip_iterator(first, _tmp.begin()),
+      oneapi::dpl::make_zip_iterator(last,
+                                     _tmp.begin() + std::distance(first, last)),
+      internal::predicate_key_fun<Pred>(p));
+  return std::get<0>(ret_val.base());
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Pred>
+internal::enable_if_execution_policy<Policy, Iter1>
+partition(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) {
+  static_assert(
+      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
+                   std::random_access_iterator_tag>::value &&
+          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
+                       std::random_access_iterator_tag>::value,
+      "Iterators passed to algorithms must be random-access iterators.");
+  return stable_partition(std::forward<Policy>(policy), first, last, mask, p);
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4>
+inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
+                          dpct::internal::is_iterator<Iter2>::value &&
+                          dpct::internal::is_iterator<Iter3>::value &&
+                          dpct::internal::is_iterator<Iter4>::value>
+sort_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
+           Iter4 values_out, ::std::int64_t n, bool descending = false,
+           int begin_bit = 0,
+           int end_bit =
+               sizeof(typename ::std::iterator_traits<Iter1>::value_type) * 8);
+
+template <typename Policy, typename Iter1, typename Iter2>
+inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
+                          dpct::internal::is_iterator<Iter2>::value>
+sort_keys(Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n,
+          bool descending = false, int begin_bit = 0,
+          int end_bit =
+              sizeof(typename ::std::iterator_traits<Iter1>::value_type) * 8);
+
+namespace internal {
+
+// Transforms key to a specific bit range and sorts the transformed key
+template <typename Policy, typename Iter1, typename Iter2,
+          typename TransformedKeyT>
+inline void transform_and_sort(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
+                               ::std::int64_t n, bool descending, int begin_bit,
+                               int end_bit) {
+  using key_t_value_t = typename std::iterator_traits<Iter1>::value_type;
+  auto trans_key =
+      translate_key<key_t_value_t, TransformedKeyT>(begin_bit, end_bit);
+
+  // Use of the comparison operator that is not simply std::greater() or
+  // std::less() will result in
+  //  not using radix sort which will cost some performance.  However, this is
+  //  necessary to provide the transformation of the key to the bitrange
+  //  desired.
+  auto partial_sort_with_comp = [&](const auto &comp) {
+    return oneapi::dpl::partial_sort_copy(
+        std::forward<Policy>(policy), keys_in, keys_in + n, keys_out,
+        keys_out + n, [=](const auto a, const auto b) {
+          return comp(trans_key(a), trans_key(b));
+        });
+  };
+  if (descending)
+    partial_sort_with_comp(::std::greater<TransformedKeyT>());
+  else
+    partial_sort_with_comp(::std::less<TransformedKeyT>());
+}
+
+template <typename Policy, typename Iter1, typename Iter2>
+inline void sort_only(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
+                      ::std::int64_t n, bool descending) {
+  using key_t_value_t = typename ::std::iterator_traits<Iter1>::value_type;
+
+  if constexpr (::std::is_floating_point<key_t_value_t>::value) {
+    if (descending) {
+      // Comparison operator that is not std::greater() ensures stability of
+      // -0.0 and 0.0
+      // at the cost of some performance because radix sort will not be used.
+      auto comp_descending = [=](const auto a, const auto b) { return a > b; };
+
+      oneapi::dpl::partial_sort_copy(::std::forward<Policy>(policy), keys_in,
+                                     keys_in + n, keys_out, keys_out + n,
+                                     comp_descending);
+    } else {
+      // Comparison operator that is not std::less() ensures stability of -0.0
+      // and 0.0
+      // at the cost of some performance because radix sort will not be used.
+      auto comp_ascending = [=](const auto a, const auto b) { return a < b; };
+
+      oneapi::dpl::partial_sort_copy(::std::forward<Policy>(policy), keys_in,
+                                     keys_in + n, keys_out, keys_out + n,
+                                     comp_ascending);
+    }
+  } else {
+    if (descending) {
+      oneapi::dpl::partial_sort_copy(::std::forward<Policy>(policy), keys_in,
+                                     keys_in + n, keys_out, keys_out + n,
+                                     ::std::greater<key_t_value_t>());
+    } else {
+
+      oneapi::dpl::partial_sort_copy(::std::forward<Policy>(policy), keys_in,
+                                     keys_in + n, keys_out, keys_out + n);
+    }
+  }
+}
+
+// Transforms key from a pair to a specific bit range and sorts the pairs by the
+// transformed key
+template <typename Policy, typename Iter1, typename Iter2,
+          typename TransformedKeyT, typename Iter3, typename Iter4>
+inline void
+transform_and_sort_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
+                         Iter3 values_in, Iter4 values_out, ::std::int64_t n,
+                         bool descending, int begin_bit, int end_bit) {
+  using key_t_value_t = typename std::iterator_traits<Iter1>::value_type;
+  auto zip_input = oneapi::dpl::zip_iterator(keys_in, values_in);
+  auto zip_output = oneapi::dpl::zip_iterator(keys_out, values_out);
+  auto trans_key =
+      translate_key<key_t_value_t, TransformedKeyT>(begin_bit, end_bit);
+
+  // Use of the comparison operator that is not simply std::greater() or
+  // std::less() will result in
+  //  not using radix sort which will cost some performance.  However, this is
+  //  necessary to provide the transformation of the key to the bitrange desired
+  //  and also to select the key from the zipped pair.
+  auto load_val = [=](const auto a) { return trans_key(std::get<0>(a)); };
+
+  auto partial_sort_with_comp = [&](const auto &comp) {
+    return oneapi::dpl::partial_sort_copy(
+        std::forward<Policy>(policy), zip_input, zip_input + n, zip_output,
+        zip_output + n, [=](const auto a, const auto b) {
+          return comp(load_val(a), load_val(b));
+        });
+  };
+  if (descending)
+    partial_sort_with_comp(::std::greater<key_t_value_t>());
+  else
+    partial_sort_with_comp(::std::less<key_t_value_t>());
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4>
+inline void sort_only_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
+                            Iter3 values_in, Iter4 values_out, ::std::int64_t n,
+                            bool descending) {
+  using key_t_value_t = typename ::std::iterator_traits<Iter1>::value_type;
+  auto zip_input = oneapi::dpl::zip_iterator(keys_in, values_in);
+  auto zip_output = oneapi::dpl::zip_iterator(keys_out, values_out);
+
+  // Use of the comparison operator that is not simply std::greater() or
+  // std::less() will result in
+  //  not using radix sort which will cost some performance.  However, this is
+  //  necessary to select the key from the zipped pair.
+  auto load_val = [=](const auto a) { return std::get<0>(a); };
+
+  auto partial_sort_with_comp = [&](const auto &comp) {
+    return oneapi::dpl::partial_sort_copy(
+        std::forward<Policy>(policy), zip_input, zip_input + n, zip_output,
+        zip_output + n, [=](const auto a, const auto b) {
+          return comp(load_val(a), load_val(b));
+        });
+  };
+  if (descending)
+    partial_sort_with_comp(::std::greater<key_t_value_t>());
+  else
+    partial_sort_with_comp(::std::less<key_t_value_t>());
+}
+
+// overload for Iter2 != std::nullptr_t
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4>
+typename ::std::enable_if<!::std::is_null_pointer<Iter2>::value>::type
+sort_pairs_impl(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
+                Iter4 values_out, ::std::int64_t n, bool descending,
+                int begin_bit, int end_bit) {
+  using key_t_value_t = typename ::std::iterator_traits<Iter1>::value_type;
+
+  int clipped_begin_bit = ::std::max(begin_bit, 0);
+  int clipped_end_bit =
+      ::std::min((::std::uint64_t)end_bit, sizeof(key_t_value_t) * 8);
+  int num_bytes = (clipped_end_bit - clipped_begin_bit - 1) / 8 + 1;
+
+  auto transform_and_sort_pairs_f = [&](auto x) {
+    using T = typename ::std::decay_t<decltype(x)>;
+    internal::transform_and_sort_pairs<decltype(policy), Iter1, Iter2, T, Iter3,
+                                       Iter4>(
+        ::std::forward<Policy>(policy), keys_in, keys_out, values_in,
+        values_out, n, descending, clipped_begin_bit, clipped_end_bit);
+  };
+
+  if (clipped_end_bit - clipped_begin_bit == sizeof(key_t_value_t) * 8) {
+    internal::sort_only_pairs(::std::forward<Policy>(policy), keys_in, keys_out,
+                              values_in, values_out, n, descending);
+  } else if (num_bytes == 1) {
+    transform_and_sort_pairs_f.template operator()<uint8_t>(0);
+  } else if (num_bytes == 2) {
+    transform_and_sort_pairs_f.template operator()<uint16_t>(0);
+  } else if (num_bytes <= 4) {
+    transform_and_sort_pairs_f.template operator()<uint32_t>(0);
+  } else // if (num_bytes <= 8)
+  {
+    transform_and_sort_pairs_f.template operator()<::std::uint64_t>(0);
+  }
+}
+
+// overload for Iter2 == std::nullptr_t
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4>
+typename ::std::enable_if<::std::is_null_pointer<Iter2>::value>::type
+sort_pairs_impl(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
+                Iter4 values_out, ::std::int64_t n, bool descending,
+                int begin_bit, int end_bit) {
+  // create temporary keys_out to discard, memory footprint could be improved by
+  // a specialized iterator with a single
+  // unchanging dummy Iter1 element
+  using key_t_value_t = typename std::iterator_traits<Iter1>::value_type;
+  sycl::buffer<key_t_value_t, 1> temp_keys_out{sycl::range<1>(n)};
+  internal::sort_pairs_impl(std::forward<Policy>(policy), keys_in,
+                            oneapi::dpl::begin(temp_keys_out), values_in,
+                            values_out, n, descending, begin_bit, end_bit);
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4, typename Iter5>
+inline void segmented_sort_pairs_by_parallel_sorts(
+    Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter4 values_in,
+    Iter3 values_out, ::std::int64_t n, ::std::int64_t nsegments,
+    Iter5 begin_offsets, Iter5 end_offsets, bool descending = false,
+    int begin_bit = 0,
+    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
+                  8) {
+  using offset_type = typename ::std::iterator_traits<Iter5>::value_type;
+  ::std::vector<offset_type> host_accessible_offset_starts(nsegments);
+  ::std::vector<offset_type> host_accessible_offset_ends(nsegments);
+  // make offsets accessible on host
+  ::std::copy(policy, begin_offsets, begin_offsets + nsegments,
+              host_accessible_offset_starts.begin());
+  ::std::copy(policy, end_offsets, end_offsets + nsegments,
+              host_accessible_offset_ends.begin());
+
+  for (::std::uint64_t i = 0; i < nsegments; i++) {
+    ::std::uint64_t segment_begin = host_accessible_offset_starts[i];
+    ::std::uint64_t segment_end =
+        ::std::min(n, (::std::int64_t)host_accessible_offset_ends[i]);
+    if (segment_begin < segment_end) {
+      ::dpct::sort_pairs(
+          policy, keys_in + segment_begin, keys_out + segment_begin,
+          values_in + segment_begin, values_out + segment_begin,
+          segment_end - segment_begin, descending, begin_bit, end_bit);
+    }
+  }
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
+inline void segmented_sort_keys_by_parallel_sorts(
+    Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n,
+    ::std::int64_t nsegments, Iter3 begin_offsets, Iter3 end_offsets,
+    bool descending = false, int begin_bit = 0,
+    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
+                  8) {
+  using offset_type = typename ::std::iterator_traits<Iter3>::value_type;
+  ::std::vector<offset_type> host_accessible_offset_starts(nsegments);
+  ::std::vector<offset_type> host_accessible_offset_ends(nsegments);
+  // make offsets accessible on host
+  ::std::copy(policy, begin_offsets, begin_offsets + nsegments,
+              host_accessible_offset_starts.begin());
+  ::std::copy(policy, end_offsets, end_offsets + nsegments,
+              host_accessible_offset_ends.begin());
+
+  for (::std::uint64_t i = 0; i < nsegments; i++) {
+    ::std::uint64_t segment_begin = host_accessible_offset_starts[i];
+    ::std::uint64_t segment_end =
+        ::std::min(n, (::std::int64_t)host_accessible_offset_ends[i]);
+    if (segment_begin < segment_end) {
+      ::dpct::sort_keys(policy, keys_in + segment_begin,
+                        keys_out + segment_begin, segment_end - segment_begin,
+                        descending, begin_bit, end_bit);
+    }
+  }
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4, typename Iter5>
+inline void segmented_sort_pairs_by_parallel_for_of_sorts(
+    Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
+    Iter4 values_out, ::std::int64_t n, ::std::int64_t nsegments,
+    Iter5 begin_offsets, Iter5 end_offsets, bool descending = false,
+    int begin_bit = 0,
+    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
+                  8) {
+  policy.queue().submit([&](sycl::handler &cgh) {
+    cgh.parallel_for(nsegments, [=](sycl::id<1> i) {
+      ::std::uint64_t segment_begin = begin_offsets[i];
+      ::std::uint64_t segment_end =
+          ::std::min(n, (::std::int64_t)end_offsets[i]);
+      if (segment_begin == segment_end) {
+        return;
+      }
+      ::dpct::sort_pairs(::std::execution::seq, keys_in + segment_begin,
+                         keys_out + segment_begin, values_in + segment_begin,
+                         values_out + segment_begin,
+                         segment_end - segment_begin, descending, begin_bit,
+                         end_bit);
+    });
+  });
+  policy.queue().wait();
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
+inline void segmented_sort_keys_by_parallel_for_of_sorts(
+    Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n,
+    ::std::int64_t nsegments, Iter3 begin_offsets, Iter3 end_offsets,
+    bool descending = false, int begin_bit = 0,
+    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
+                  8) {
+  policy.queue().submit([&](sycl::handler &cgh) {
+    cgh.parallel_for(nsegments, [=](sycl::id<1> i) {
+      ::std::uint64_t segment_begin = begin_offsets[i];
+      ::std::uint64_t segment_end =
+          ::std::min(n, (::std::int64_t)end_offsets[i]);
+      if (segment_begin == segment_end) {
+        return;
+      }
+      ::dpct::sort_keys(::std::execution::seq, keys_in + segment_begin,
+                        keys_out + segment_begin, segment_end - segment_begin,
+                        descending, begin_bit, end_bit);
+    });
+  });
+  policy.queue().wait();
+}
+
+template <typename Policy, typename OffsetIteratorT>
+inline void mark_segments(Policy &&policy, OffsetIteratorT begin_offsets,
+                          OffsetIteratorT end_offsets, ::std::int64_t n,
+                          ::std::int64_t nsegments,
+                          sycl::buffer<::std::size_t, 1> segments) {
+
+  ::std::size_t work_group_size =
+      policy.queue()
+          .get_device()
+          .template get_info<sycl::info::device::max_work_group_size>();
+
+  auto sg_sizes = policy.queue()
+                      .get_device()
+                      .template get_info<sycl::info::device::sub_group_sizes>();
+  ::std::size_t sub_group_size = sg_sizes.empty() ? 0 : sg_sizes.back();
+
+  float avg_seg_size = (float)n / (float)nsegments;
+  if (avg_seg_size > work_group_size) {
+    // If average segment size is larger than workgroup, use workgroup to
+    // coordinate to mark segments
+    policy.queue()
+        .submit([&](sycl::handler &h) {
+          auto segments_acc = segments.get_access<sycl::access_mode::write>(h);
+          h.parallel_for(work_group_size, ([=](sycl::id<1> id) {
+                           for (::std::size_t seg = 0; seg < nsegments; seg++) {
+                             ::std::size_t i = begin_offsets[seg];
+                             ::std::size_t end = end_offsets[seg];
+                             while (i + id < end) {
+                               segments_acc[i + id] = seg;
+                               i += work_group_size;
+                             }
+                           }
+                         }));
+        })
+        .wait();
+  } else if (sub_group_size > 0 && avg_seg_size > sub_group_size / 2) {
+    // If average segment size is larger than half a subgroup, use subgroup to
+    // coordinate to mark segments
+    policy.queue()
+        .submit([&](sycl::handler &h) {
+          auto segments_acc = segments.get_access<sycl::access_mode::write>(h);
+          h.parallel_for(
+              sycl::nd_range<1>{work_group_size, work_group_size},
+              ([=](sycl::nd_item<1> item) {
+                auto sub_group = item.get_sub_group();
+                ::std::size_t num_subgroups =
+                    sub_group.get_group_range().size();
+                ::std::size_t local_size = sub_group.get_local_range().size();
+
+                ::std::size_t sub_group_id = sub_group.get_group_id();
+                while (sub_group_id < nsegments) {
+                  ::std::size_t subgroup_local_id = sub_group.get_local_id();
+                  ::std::size_t i = begin_offsets[sub_group_id];
+                  ::std::size_t end = end_offsets[sub_group_id];
+                  while (i + subgroup_local_id < end) {
+                    segments_acc[i + subgroup_local_id] = sub_group_id;
+                    i += local_size;
+                  }
+                  sub_group_id += num_subgroups;
+                }
+              }));
+        })
+        .wait();
+  } else {
+    // If average segment size is small as compared to subgroup, use single
+    // work item to mark each segment
+    policy.queue()
+        .submit([&](sycl::handler &h) {
+          auto segments_acc = segments.get_access<sycl::access_mode::write>(h);
+          h.parallel_for(nsegments, ([=](sycl::id<1> seg) {
+                           for (::std::size_t i = begin_offsets[seg];
+                                i < end_offsets[seg]; i++) {
+                             segments_acc[i] = seg;
+                           }
+                         }));
+        })
+        .wait();
+  }
+}
+
+// The dpl_histogram namespace contains a temporary preview of an upcoming
+// oneDPL histogram API.  This namespace will be removed and replaced with
+// corresponding calls to oneapi::dpl::histogram()
+namespace dpl_histogram {
+
+template <typename T1, typename T2>
+constexpr inline auto __ceiling_div(const T1 &number, const T2 &divisor) {
+  return (number - 1) / divisor + 1;
+}
+
+template <typename T1, bool _IsFloatingPoint>
+struct __evenly_divided_binhash_impl {};
+
+template <typename T>
+struct __evenly_divided_binhash_impl<T, /* is_floating_point = */ true> {
+  T __minimum;
+  ::std::uint32_t __num_bins;
+  T __scale;
+  T __maximum;
+  __evenly_divided_binhash_impl(const T &min, const T &max,
+                                const ::std::uint32_t &num_bins)
+      : __minimum(min), __maximum(max), __num_bins(num_bins),
+        __scale(T(num_bins) / (max - min)) {}
+  template <typename T2> std::uint32_t operator()(T2 &&value) const {
+    return ::std::uint32_t((::std::forward<T2>(value) - __minimum) * __scale);
+  }
+
+  template <typename T2> bool is_valid(const T2 &value) const {
+    return value >= __minimum && value < __maximum;
+  }
+};
+
+// non floating point type
+template <typename T>
+struct __evenly_divided_binhash_impl<T, /* is_floating_point= */ false> {
+  T __minimum;
+  ::std::uint32_t __num_bins;
+  T __range_size;
+  __evenly_divided_binhash_impl(const T &min, const T &max,
+                                const ::std::uint32_t &num_bins)
+      : __minimum(min), __num_bins(num_bins), __range_size(max - min) {}
+  template <typename T2>::std::uint32_t operator()(T2 &&value) const {
+    return ::std::uint32_t(
+        ((::std::uint64_t(::std::forward<T2>(value)) - __minimum) *
+         ::std::uint64_t(__num_bins)) /
+        __range_size);
+  }
+
+  template <typename T2> bool is_valid(const T2 &value) const {
+    return value >= __minimum && value < __minimum + __range_size;
+  }
+};
+
+template <typename T1>
+using __evenly_divided_binhash =
+    __evenly_divided_binhash_impl<T1, ::std::is_floating_point_v<T1>>;
+
+template <typename Range> struct __custom_range_binhash {
+  Range __boundaries;
+  __custom_range_binhash(Range boundaries) : __boundaries(boundaries) {}
+
+  template <typename T>::std::uint32_t operator()(T &&value) const {
+    return (::std::upper_bound(__boundaries.begin(), __boundaries.end(),
+                               ::std::forward<T>(value)) -
+            __boundaries.begin()) -
+           1;
+  }
+
+  template <typename T2> bool is_valid(const T2 &value) const {
+    return value >= __boundaries[0] &&
+           value < __boundaries[__boundaries.size() - 1];
+  }
+};
+
+template <typename HistAccessor, typename OffsetT, typename Size>
+inline void __clear_wglocal_histograms(const HistAccessor &local_histogram,
+                                       const OffsetT &offset,
+                                       const Size &num_bins,
+                                       const sycl::nd_item<1> &self_item) {
+  ::std::uint32_t gSize = self_item.get_local_range()[0];
+  ::std::uint32_t self_lidx = self_item.get_local_id(0);
+  ::std::uint8_t factor = __ceiling_div(num_bins, gSize);
+  ::std::uint8_t k;
+  _DPCT_PRAGMA_UNROLL
+  for (k = 0; k < factor - 1; k++) {
+    local_histogram[offset + gSize * k + self_lidx] = 0;
+  }
+  if (gSize * k + self_lidx < num_bins) {
+    local_histogram[offset + gSize * k + self_lidx] = 0;
+  }
+  self_item.barrier(sycl::access::fence_space::local_space);
+}
+
+template <typename BinIdxType, typename Iter1, typename HistReg,
+          typename BinFunc>
+inline void __accum_local_register_iter(const Iter1 &in_acc,
+                                        const ::std::size_t &index,
+                                        HistReg *histogram, BinFunc func) {
+  const auto &x = in_acc[index];
+  if (func.is_valid(x)) {
+    BinIdxType c = func(x);
+    histogram[c]++;
+  }
+}
+
+template <typename BinIdxType, sycl::access::address_space AddressSpace,
+          typename Iter1, typename HistAccessor, typename OffsetT,
+          typename BinFunc>
+inline void __accum_local_atomics_iter(const Iter1 &in_acc,
+                                       const ::std::size_t &index,
+                                       const HistAccessor &wg_local_histogram,
+                                       const OffsetT &offset, BinFunc func) {
+  using __histo_value_type = typename HistAccessor::value_type;
+  const auto &x = in_acc[index];
+  if (func.is_valid(x)) {
+    BinIdxType c = func(x);
+    sycl::atomic_ref<__histo_value_type, sycl::memory_order::relaxed,
+                     sycl::memory_scope::work_group, AddressSpace>
+        local_bin(wg_local_histogram[offset + c]);
+    local_bin++;
+  }
+}
+
+template <typename BinType, typename HistAccessorIn, typename OffsetT,
+          typename HistAccessorOut, typename Size>
+inline void __reduce_out_histograms(const HistAccessorIn &in_histogram,
+                                    const OffsetT &offset,
+                                    const HistAccessorOut &out_histogram,
+                                    const Size &num_bins,
+                                    const sycl::nd_item<1> &self_item) {
+  ::std::uint32_t gSize = self_item.get_local_range()[0];
+  ::std::uint32_t self_lidx = self_item.get_local_id(0);
+  ::std::uint8_t factor = __ceiling_div(num_bins, gSize);
+  ::std::uint8_t k;
+
+  _DPCT_PRAGMA_UNROLL
+  for (k = 0; k < factor - 1; k++) {
+    sycl::atomic_ref<BinType, sycl::memory_order::relaxed,
+                     sycl::memory_scope::device,
+                     sycl::access::address_space::global_space>
+        global_bin(out_histogram[gSize * k + self_lidx]);
+    global_bin += in_histogram[offset + gSize * k + self_lidx];
+  }
+  if (gSize * k + self_lidx < num_bins) {
+    sycl::atomic_ref<BinType, sycl::memory_order::relaxed,
+                     sycl::memory_scope::device,
+                     sycl::access::address_space::global_space>
+        global_bin(out_histogram[gSize * k + self_lidx]);
+    global_bin += in_histogram[offset + gSize * k + self_lidx];
+  }
+}
+
+template <::std::uint16_t ItersPerWorkItem, ::std::uint8_t BinsPerWorkItem,
+          typename BinType, typename Policy, typename Range1, typename Range2,
+          typename Size, typename IdxHashFunc, typename... Range3>
+inline void __histogram_general_registers_local_reduction(
+    Policy &&policy, ::std::uint16_t work_group_size, Range1 &&input,
+    Range2 &&bins, const Size &num_bins, IdxHashFunc func,
+    Range3 &&...opt_range) {
+  const ::std::size_t N = input.size();
+  using __local_histogram_type = ::std::uint32_t;
+  using __private_histogram_type = ::std::uint16_t;
+
+  ::std::size_t segments = __ceiling_div(N, work_group_size * ItersPerWorkItem);
+  auto e = policy.queue().submit([&](auto &h) {
+    // Temporary use of stable non-public API from oneDPL,  this function will
+    // be replaced with oneDPL call in an upcoming release.
+    oneapi::dpl::__ranges::__require_access(h, input, bins, opt_range...);
+    sycl::local_accessor<__local_histogram_type, 1> local_histogram(
+        sycl::range(num_bins), h);
+    h.parallel_for(
+        sycl::nd_range<1>(segments * work_group_size, work_group_size),
+        [=](sycl::nd_item<1> __self_item) {
+          using __bin_idx_type = ::std::uint8_t;
+          const ::std::size_t __self_lidx = __self_item.get_local_id(0);
+          const ::std::size_t __wgroup_idx = __self_item.get_group(0);
+          const ::std::size_t __seg_start =
+              work_group_size * ItersPerWorkItem * __wgroup_idx;
+
+          __clear_wglocal_histograms(local_histogram, 0, num_bins, __self_item);
+          __private_histogram_type histogram[BinsPerWorkItem];
+          _DPCT_PRAGMA_UNROLL
+          for (::std::uint8_t k = 0; k < BinsPerWorkItem; k++) {
+            histogram[k] = 0;
+          }
+
+          if (__seg_start + work_group_size * ItersPerWorkItem < N) {
+            _DPCT_PRAGMA_UNROLL
+            for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) {
+              __accum_local_register_iter<__bin_idx_type>(
+                  input, __seg_start + idx * work_group_size + __self_lidx,
+                  histogram, func);
+            }
+          } else {
+            _DPCT_PRAGMA_UNROLL
+            for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) {
+              ::std::size_t __val_idx =
+                  __seg_start + idx * work_group_size + __self_lidx;
+              if (__val_idx < N) {
+                __accum_local_register_iter<__bin_idx_type>(input, __val_idx,
+                                                            histogram, func);
+              }
+            }
+          }
+
+          _DPCT_PRAGMA_UNROLL
+          for (::std::uint8_t k = 0; k < num_bins; k++) {
+            sycl::atomic_ref<__local_histogram_type,
+                             sycl::memory_order::relaxed,
+                             sycl::memory_scope::work_group,
+                             sycl::access::address_space::local_space>
+                local_bin(local_histogram[k]);
+            local_bin += histogram[k];
+          }
+
+          __self_item.barrier(sycl::access::fence_space::local_space);
+
+          __reduce_out_histograms<BinType>(local_histogram, 0, bins, num_bins,
+                                           __self_item);
+        });
+  });
+  e.wait();
+}
+
+template <::std::uint16_t ItersPerWorkItem, typename BinType, typename Policy,
+          typename Range1, typename Range2, typename Size, typename IdxHashFunc,
+          typename... Range3>
+inline void __histogram_general_local_atomics(Policy &&policy,
+                                              ::std::uint16_t work_group_size,
+                                              Range1 &&input, Range2 &&bins,
+                                              const Size &num_bins,
+                                              IdxHashFunc func,
+                                              Range3 &&...opt_range) {
+  const ::std::size_t N = input.size();
+  ::std::size_t segments = __ceiling_div(N, work_group_size * ItersPerWorkItem);
+  auto e = policy.queue().submit([&](auto &h) {
+    // Temporary use of stable non-public API from oneDPL,  this function will
+    // be replaced with oneDPL call in an upcoming release.
+    oneapi::dpl::__ranges::__require_access(h, input, bins, opt_range...);
+    sycl::local_accessor<::std::uint32_t, 1> local_histogram(
+        sycl::range(num_bins), h);
+    h.parallel_for(
+        sycl::nd_range<1>(segments * work_group_size, work_group_size),
+        [=](sycl::nd_item<1> __self_item) {
+          using __bin_idx_type = ::std::uint16_t;
+          constexpr auto __atomic_address_space =
+              sycl::access::address_space::local_space;
+          const ::std::size_t __self_lidx = __self_item.get_local_id(0);
+          const ::std::uint32_t __wgroup_idx = __self_item.get_group(0);
+          const ::std::size_t __seg_start =
+              work_group_size * __wgroup_idx * ItersPerWorkItem;
+
+          __clear_wglocal_histograms(local_histogram, 0, num_bins, __self_item);
+
+          if (__seg_start + work_group_size * ItersPerWorkItem < N) {
+            _DPCT_PRAGMA_UNROLL
+            for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) {
+              __accum_local_atomics_iter<__bin_idx_type,
+                                         __atomic_address_space>(
+                  input, __seg_start + idx * work_group_size + __self_lidx,
+                  local_histogram, 0, func);
+            }
+          } else {
+            _DPCT_PRAGMA_UNROLL
+            for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) {
+              ::std::size_t __val_idx =
+                  __seg_start + idx * work_group_size + __self_lidx;
+              if (__val_idx < N) {
+                __accum_local_atomics_iter<__bin_idx_type,
+                                           __atomic_address_space>(
+                    input, __val_idx, local_histogram, 0, func);
+              }
+            }
+          }
+          __self_item.barrier(sycl::access::fence_space::local_space);
+
+          __reduce_out_histograms<BinType>(local_histogram, 0, bins, num_bins,
+                                           __self_item);
+        });
+  });
+
+  e.wait();
+}
+
+template <::std::uint16_t __min_iters_per_work_item, typename BinType,
+          typename Policy, typename Range1, typename Range2, typename Size,
+          typename IdxHashFunc, typename... Range3>
+inline void __histogram_general_private_global_atomics(
+    Policy &&policy, ::std::uint16_t work_group_size, Range1 &&input,
+    Range2 &&bins, const Size &num_bins, IdxHashFunc func,
+    Range3 &&...opt_range) {
+
+  const ::std::size_t N = input.size();
+  auto __global_mem_size =
+      policy.queue()
+          .get_device()
+          .template get_info<sycl::info::device::global_mem_size>();
+  const ::std::size_t max_segments =
+      ::std::min(__global_mem_size / (num_bins * sizeof(BinType)),
+                 __ceiling_div(N, work_group_size * __min_iters_per_work_item));
+  const ::std::size_t iters_per_work_item =
+      __ceiling_div(N, max_segments * work_group_size);
+  ::std::size_t segments =
+      __ceiling_div(N, work_group_size * iters_per_work_item);
+
+  sycl::buffer<BinType, 1> private_histograms(
+      sycl::range<1>(segments * num_bins));
+
+  auto e = policy.queue().submit([&](auto &h) {
+    // Temporary use of stable non-public API from oneDPL,  this function will
+    // be replaced with oneDPL call in an upcoming release.
+    oneapi::dpl::__ranges::__require_access(h, input, bins, opt_range...);
+    sycl::accessor hacc_private(private_histograms, h, sycl::read_write,
+                                sycl::no_init);
+    h.parallel_for(
+        sycl::nd_range<1>(segments * work_group_size, work_group_size),
+        [=](sycl::nd_item<1> __self_item) {
+          using __bin_idx_type = ::std::uint32_t;
+          constexpr auto __atomic_address_space =
+              sycl::access::address_space::global_space;
+          const ::std::size_t __self_lidx = __self_item.get_local_id(0);
+          const ::std::size_t __wgroup_idx = __self_item.get_group(0);
+          const ::std::size_t __seg_start =
+              work_group_size * iters_per_work_item * __wgroup_idx;
+
+          __clear_wglocal_histograms(hacc_private, __wgroup_idx * num_bins,
+                                     num_bins, __self_item);
+          if (__seg_start + work_group_size * iters_per_work_item < N) {
+            for (::std::size_t idx = 0; idx < iters_per_work_item; idx++) {
+              __accum_local_atomics_iter<__bin_idx_type,
+                                         __atomic_address_space>(
+                  input, __seg_start + idx * work_group_size + __self_lidx,
+                  hacc_private, __wgroup_idx * num_bins, func);
+            }
+          } else {
+            for (::std::size_t idx = 0; idx < iters_per_work_item; idx++) {
+              ::std::size_t __val_idx =
+                  __seg_start + idx * work_group_size + __self_lidx;
+              if (__val_idx < N) {
+                __accum_local_atomics_iter<__bin_idx_type,
+                                           __atomic_address_space>(
+                    input, __val_idx, hacc_private, __wgroup_idx * num_bins,
+                    func);
+              }
+            }
+          }
+          __self_item.barrier(sycl::access::fence_space::local_space);
+
+          __reduce_out_histograms<BinType>(hacc_private,
+                                           __wgroup_idx * num_bins, bins,
+                                           num_bins, __self_item);
+        });
+  });
+  e.wait();
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Size,
+          typename IdxHashFunc, typename... Range>
+inline Iter2
+__histogram_general_select_best(Policy &&policy, Iter1 first, Iter1 last,
+                                Iter2 histogram_first, const Size &num_bins,
+                                IdxHashFunc func, Range &&...opt_range) {
+  using __histo_value_type = typename ::std::iterator_traits<Iter2>::value_type;
+  auto __local_mem_size =
+      policy.queue()
+          .get_device()
+          .template get_info<sycl::info::device::local_mem_size>();
+  constexpr ::std::uint8_t __max_registers = 16;
+
+  // Temporary use of stable non-public API from oneDPL,  this function will be
+  // replaced with oneDPL call in an upcoming release.
+  auto keep_bins = oneapi::dpl::__ranges::__get_sycl_range<
+      oneapi::dpl::__par_backend_hetero::access_mode::write, Iter2>();
+  auto bins_buf = keep_bins(histogram_first, histogram_first + num_bins);
+
+  oneapi::dpl::fill(policy, bins_buf.all_view().begin(),
+                    bins_buf.all_view().end(), __histo_value_type(0));
+  auto N = last - first;
+  if (N > 0) {
+    // Temporary use of stable non-public API from oneDPL,  this function will
+    // be replaced with oneDPL call in an upcoming release.
+    auto keep_input = oneapi::dpl::__ranges::__get_sycl_range<
+        oneapi::dpl::__par_backend_hetero::access_mode::read, Iter1>();
+    auto input_buf = keep_input(first, last);
+
+    ::std::size_t max_work_group_size =
+        policy.queue()
+            .get_device()
+            .template get_info<sycl::info::device::max_work_group_size>();
+    ::std::size_t work_group_size =
+        ::std::min(max_work_group_size, ::std::size_t(1024));
+
+    if (num_bins < __max_registers) {
+
+      // If bins fit into registers, use register private accumulation
+      __histogram_general_registers_local_reduction<32, 16, __histo_value_type>(
+          ::std::forward<Policy>(policy), work_group_size, input_buf.all_view(),
+          bins_buf.all_view(), num_bins, func,
+          ::std::forward<Range...>(opt_range)...);
+    } else if (num_bins * sizeof(__histo_value_type) < __local_mem_size) {
+      // If bins fit into SLM, use local atomics
+
+      // Experimentally determined iters per work-item
+      if (N <= 524288) {
+        __histogram_general_local_atomics<4, __histo_value_type>(
+            ::std::forward<Policy>(policy), work_group_size,
+            input_buf.all_view(), bins_buf.all_view(), num_bins, func,
+            ::std::forward<Range...>(opt_range)...);
+      } else {
+        __histogram_general_local_atomics<32, __histo_value_type>(
+            ::std::forward<Policy>(policy), work_group_size,
+            input_buf.all_view(), bins_buf.all_view(), num_bins, func,
+            ::std::forward<Range...>(opt_range)...);
+      }
+    } else // Otherwise, use global atomics (private copies per workgroup)
+    {
+      // Experimentally determined iters per work-item
+      if (N <= 524288) {
+        __histogram_general_private_global_atomics<4, __histo_value_type>(
+            ::std::forward<Policy>(policy), work_group_size,
+            input_buf.all_view(), bins_buf.all_view(), num_bins, func,
+            ::std::forward<Range...>(opt_range)...);
+      } else {
+        __histogram_general_private_global_atomics<32, __histo_value_type>(
+            ::std::forward<Policy>(policy), work_group_size,
+            input_buf.all_view(), bins_buf.all_view(), num_bins, func,
+            ::std::forward<Range...>(opt_range)...);
+      }
+    }
+  }
+  return histogram_first + num_bins;
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Size,
+          typename T>
+inline ::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+        dpct::internal::is_iterator<Iter2>::value &&
+        internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value,
+    Iter2>
+histogram(Policy &&policy, Iter1 first, Iter1 last, Iter2 histogram_first,
+          const Size &num_bins, const T &first_bin_min_val,
+          const T &last_bin_max_val) {
+  return __histogram_general_select_best(
+      ::std::forward<Policy>(policy), first, last, histogram_first, num_bins,
+      __evenly_divided_binhash<T>(first_bin_min_val, last_bin_max_val,
+                                  num_bins));
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
+inline ::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+        dpct::internal::is_iterator<Iter2>::value &&
+        dpct::internal::is_iterator<Iter3>::value &&
+        internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value,
+    Iter2>
+histogram(Policy &&policy, Iter1 first, Iter1 last, Iter2 histogram_first,
+          Iter3 boundary_first, Iter3 boundary_last) {
+  // Temporary use of stable non-public API from oneDPL,  this function will be
+  // replaced with oneDPL call in an upcoming release.
+  auto keep_boundaries = oneapi::dpl::__ranges::__get_sycl_range<
+      oneapi::dpl::__par_backend_hetero::access_mode::read, Iter3>();
+  auto boundary_buf = keep_boundaries(boundary_first, boundary_last);
+
+  return __histogram_general_select_best(
+      ::std::forward<Policy>(policy), first, last, histogram_first,
+      (boundary_last - boundary_first) - 1,
+      __custom_range_binhash{boundary_buf.all_view()}, boundary_buf.all_view());
+}
+} // end namespace dpl_histogram
+
+} // end namespace internal
+
+// Evenly Divided Histogram of a 1-D array
+template <typename Policy, typename Iter1, typename Iter2, typename T,
+          typename Size>
+::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+    dpct::internal::is_iterator<Iter2>::value &&
+    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
+histogram_even(Policy &&policy, Iter1 d_samples, Iter2 d_histogram,
+               int num_levels, T lower_level, T upper_level, Size num_samples) {
+  internal::dpl_histogram::histogram(::std::forward<Policy>(policy), d_samples,
+                                     d_samples + num_samples, d_histogram,
+                                     num_levels - 1, lower_level, upper_level);
+}
+
+// Evenly Divided Histogram of a 2-D ROI in a flattened 2-D array
+template <typename Policy, typename Iter1, typename Iter2, typename T,
+          typename OffsetT>
+::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+    dpct::internal::is_iterator<Iter2>::value &&
+    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
+histogram_even_roi(Policy &&policy, Iter1 d_samples, Iter2 d_histogram,
+                   int num_levels, T lower_level, T upper_level,
+                   OffsetT num_row_samples, OffsetT num_rows,
+                   ::std::size_t row_stride_bytes) {
+  return histogram_even(
+      ::std::forward<Policy>(policy),
+      oneapi::dpl::permutation_iterator(
+          d_samples,
+          internal::__roi_2d_index_functor(
+              num_row_samples,
+              row_stride_bytes /
+                  sizeof(typename ::std::iterator_traits<Iter1>::value_type))),
+      d_histogram, num_levels, lower_level, upper_level,
+      num_row_samples * num_rows);
+}
+
+// Evenly Divided Multi-Channel Histogram of a 1-D array
+template <int NumChannels, int NumActiveChannels, typename Policy,
+          typename Iter1, typename Iter2, typename T, typename Size>
+::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+    dpct::internal::is_iterator<Iter2>::value &&
+    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
+multi_histogram_even(Policy &&policy, Iter1 d_samples,
+                     Iter2 d_histogram[NumActiveChannels],
+                     int num_levels[NumActiveChannels],
+                     T lower_level[NumActiveChannels],
+                     T upper_level[NumActiveChannels], Size num_pixels) {
+  for (int active_channel = 0; active_channel < NumActiveChannels;
+       active_channel++) {
+    histogram_even(
+        policy,
+        oneapi::dpl::permutation_iterator(
+            d_samples,
+            internal::__interleaved_index_functor(NumChannels, active_channel)),
+        d_histogram[active_channel], num_levels[active_channel],
+        lower_level[active_channel], upper_level[active_channel], num_pixels);
+  }
+}
+
+// Evenly Divided Multi-Channel Histogram of a 2-D ROI in a flattened 2-D array
+template <int NumChannels, int NumActiveChannels, typename Policy,
+          typename Iter1, typename Iter2, typename T, typename OffsetT>
+::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+    dpct::internal::is_iterator<Iter2>::value &&
+    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
+multi_histogram_even_roi(Policy &&policy, Iter1 d_samples,
+                         Iter2 d_histogram[NumActiveChannels],
+                         int num_levels[NumActiveChannels],
+                         T lower_level[NumActiveChannels],
+                         T upper_level[NumActiveChannels],
+                         OffsetT num_row_samples, OffsetT num_rows,
+                         ::std::size_t row_stride_bytes) {
+  for (int active_channel = 0; active_channel < NumActiveChannels;
+       active_channel++) {
+    histogram_even(
+        policy,
+        oneapi::dpl::permutation_iterator(
+            d_samples,
+            internal::__composition_functor(
+                internal::__roi_2d_index_functor(
+                    num_row_samples,
+                    row_stride_bytes /
+                        (NumChannels * sizeof(typename ::std::iterator_traits<
+                                              Iter1>::value_type))),
+                internal::__interleaved_index_functor(NumChannels,
+                                                      active_channel))),
+        d_histogram[active_channel], num_levels[active_channel],
+        lower_level[active_channel], upper_level[active_channel],
+        num_row_samples * num_rows);
+  }
+}
+
+// Custom Range Histogram of a 1-D array
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Size>
+::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+    dpct::internal::is_iterator<Iter2>::value &&
+    dpct::internal::is_iterator<Iter3>::value &&
+    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
+histogram_range(Policy &&policy, Iter1 d_samples, Iter2 d_histogram,
+                int num_levels, Iter3 d_levels, Size num_samples) {
+  internal::dpl_histogram::histogram(::std::forward<Policy>(policy), d_samples,
+                                     d_samples + num_samples, d_histogram,
+                                     d_levels, d_levels + num_levels);
+}
+
+// Custom Range Histogram of a 2-D ROI in a flattened 2-D Array
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename OffsetT>
+::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+    dpct::internal::is_iterator<Iter2>::value &&
+    dpct::internal::is_iterator<Iter3>::value &&
+    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
+histogram_range_roi(Policy &&policy, Iter1 d_samples, Iter2 d_histogram,
+                    int num_levels, Iter3 d_levels, OffsetT num_row_samples,
+                    OffsetT num_rows, ::std::size_t row_stride_bytes) {
+  return histogram_range(
+      ::std::forward<Policy>(policy),
+      oneapi::dpl::permutation_iterator(
+          d_samples,
+          internal::__roi_2d_index_functor(
+              num_row_samples,
+              row_stride_bytes /
+                  sizeof(typename ::std::iterator_traits<Iter1>::value_type))),
+      d_histogram, num_levels, d_levels, num_row_samples * num_rows);
+}
+
+// Custom Range Multi-Channel Histogram of a 1-D array
+template <int NumChannels, int NumActiveChannels, typename Policy,
+          typename Iter1, typename Iter2, typename Iter3, typename Size>
+::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+    dpct::internal::is_iterator<Iter2>::value &&
+    dpct::internal::is_iterator<Iter3>::value &&
+    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
+multi_histogram_range(Policy &&policy, Iter1 d_samples,
+                      Iter2 d_histogram[NumActiveChannels],
+                      int num_levels[NumActiveChannels],
+                      Iter3 d_levels[NumActiveChannels], Size num_pixels) {
+  for (int active_channel = 0; active_channel < NumActiveChannels;
+       active_channel++) {
+    histogram_range(policy,
+                    oneapi::dpl::permutation_iterator(
+                        d_samples, internal::__interleaved_index_functor(
+                                       NumChannels, active_channel)),
+                    d_histogram[active_channel], num_levels[active_channel],
+                    d_levels[active_channel], num_pixels);
+  }
+}
+
+// Custom Range Multi-Channel Histogram of a 2-D ROI in a flattened 2-D array
+template <int NumChannels, int NumActiveChannels, typename Policy,
+          typename Iter1, typename Iter2, typename Iter3, typename OffsetT>
+::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+    dpct::internal::is_iterator<Iter2>::value &&
+    dpct::internal::is_iterator<Iter3>::value &&
+    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
+multi_histogram_range_roi(Policy &&policy, Iter1 d_samples,
+                          Iter2 d_histogram[NumActiveChannels],
+                          int num_levels[NumActiveChannels],
+                          Iter3 d_levels[NumActiveChannels],
+                          OffsetT num_row_samples, OffsetT num_rows,
+                          ::std::size_t row_stride_bytes) {
+  for (int active_channel = 0; active_channel < NumActiveChannels;
+       active_channel++) {
+    histogram_range(
+        policy,
+        oneapi::dpl::permutation_iterator(
+            d_samples,
+            internal::__composition_functor(
+                internal::__roi_2d_index_functor(
+                    num_row_samples,
+                    row_stride_bytes /
+                        (NumChannels * sizeof(typename ::std::iterator_traits<
+                                              Iter1>::value_type))),
+                internal::__interleaved_index_functor(NumChannels,
+                                                      active_channel))),
+        d_histogram[active_channel], num_levels[active_channel],
+        d_levels[active_channel], num_row_samples * num_rows);
+  }
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4>
+inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
+                          dpct::internal::is_iterator<Iter2>::value &&
+                          dpct::internal::is_iterator<Iter3>::value &&
+                          dpct::internal::is_iterator<Iter4>::value>
+sort_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
+           Iter4 values_out, ::std::int64_t n, bool descending, int begin_bit,
+           int end_bit) {
+  internal::sort_pairs_impl(std::forward<Policy>(policy), keys_in, keys_out,
+                            values_in, values_out, n, descending, begin_bit,
+                            end_bit);
+}
+
+template <typename Policy, typename Iter1, typename Iter2>
+inline void sort_pairs(
+    Policy &&policy, io_iterator_pair<Iter1> &keys,
+    io_iterator_pair<Iter2> &values, ::std::int64_t n, bool descending = false,
+    bool do_swap_iters = false, int begin_bit = 0,
+    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
+                  8) {
+  sort_pairs(::std::forward<Policy>(policy), keys.first(), keys.second(),
+             values.first(), values.second(), n, descending, begin_bit,
+             end_bit);
+  if (do_swap_iters) {
+    keys.swap();
+    values.swap();
+  }
+}
+
+template <typename Policy, typename Iter1, typename Iter2>
+inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
+                          dpct::internal::is_iterator<Iter2>::value>
+sort_keys(Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n,
+          bool descending, int begin_bit, int end_bit) {
+  using key_t_value_t = typename ::std::iterator_traits<Iter1>::value_type;
+
+  int clipped_begin_bit = ::std::max(begin_bit, 0);
+  int clipped_end_bit =
+      ::std::min((::std::uint64_t)end_bit, sizeof(key_t_value_t) * 8);
+  int num_bytes = (clipped_end_bit - clipped_begin_bit - 1) / 8 + 1;
+
+  auto transform_and_sort_f = [&](auto x) {
+    using T = typename ::std::decay_t<decltype(x)>;
+    internal::transform_and_sort<decltype(policy), Iter1, Iter2, T>(
+        ::std::forward<Policy>(policy), keys_in, keys_out, n, descending,
+        clipped_begin_bit, clipped_end_bit);
+  };
+
+  if (clipped_end_bit - clipped_begin_bit == sizeof(key_t_value_t) * 8) {
+    internal::sort_only(::std::forward<Policy>(policy), keys_in, keys_out, n,
+                        descending);
+  } else if (num_bytes == 1) {
+    transform_and_sort_f.template operator()<uint8_t>(0);
+  } else if (num_bytes == 2) {
+    transform_and_sort_f.template operator()<uint16_t>(0);
+  } else if (num_bytes <= 4) {
+    transform_and_sort_f.template operator()<uint32_t>(0);
+  } else // if (num_bytes <= 8)
+  {
+    transform_and_sort_f.template operator()<::std::uint64_t>(0);
+  }
+}
+
+template <typename Policy, typename Iter1>
+inline void sort_keys(
+    Policy &&policy, io_iterator_pair<Iter1> &keys, ::std::int64_t n,
+    bool descending = false, bool do_swap_iters = false, int begin_bit = 0,
+    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
+                  8) {
+  sort_keys(std::forward<Policy>(policy), keys.first(), keys.second(), n,
+            descending, begin_bit, end_bit);
+  if (do_swap_iters)
+    keys.swap();
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
+inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
+                          dpct::internal::is_iterator<Iter2>::value>
+segmented_sort_keys(
+    Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n,
+    ::std::int64_t nsegments, Iter3 begin_offsets, Iter3 end_offsets,
+    bool descending = false, int begin_bit = 0,
+    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
+                  8) {
+  int compute_units =
+      policy.queue()
+          .get_device()
+          .template get_info<sycl::info::device::max_compute_units>();
+  auto sg_sizes = policy.queue()
+                      .get_device()
+                      .template get_info<sycl::info::device::sub_group_sizes>();
+  int subgroup_size = sg_sizes.empty() ? 1 : sg_sizes.back();
+  // parallel for of serial sorts when we have sufficient number of segments for
+  // load balance when number of segments is large as compared to our target
+  // compute capability
+  if (nsegments >
+      compute_units *
+          (policy.queue().get_device().is_gpu() ? subgroup_size : 1)) {
+    dpct::internal::segmented_sort_keys_by_parallel_for_of_sorts(
+        ::std::forward<Policy>(policy), keys_in, keys_out, n, nsegments,
+        begin_offsets, end_offsets, descending, begin_bit, end_bit);
+  } else
+  {
+    dpct::internal::segmented_sort_keys_by_parallel_sorts(
+        ::std::forward<Policy>(policy), keys_in, keys_out, n, nsegments,
+        begin_offsets, end_offsets, descending, begin_bit, end_bit);
+  }
+}
+
+template <typename Policy, typename Iter1, typename Iter2>
+inline void segmented_sort_keys(
+    Policy &&policy, io_iterator_pair<Iter1> &keys, ::std::int64_t n,
+    ::std::int64_t nsegments, Iter2 begin_offsets, Iter2 end_offsets,
+    bool descending = false, bool do_swap_iters = false, int begin_bit = 0,
+    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
+                  8) {
+  segmented_sort_keys(::std::forward<Policy>(policy), keys.first(),
+                      keys.second(), n, nsegments, begin_offsets, end_offsets,
+                      descending, begin_bit, end_bit);
+  if (do_swap_iters) {
+    keys.swap();
+  }
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
+          typename Iter4, typename Iter5>
+inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
+                          dpct::internal::is_iterator<Iter2>::value &&
+                          dpct::internal::is_iterator<Iter3>::value &&
+                          dpct::internal::is_iterator<Iter4>::value>
+segmented_sort_pairs(
+    Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
+    Iter4 values_out, ::std::int64_t n, ::std::int64_t nsegments,
+    Iter5 begin_offsets, Iter5 end_offsets, bool descending = false,
+    int begin_bit = 0,
+    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
+                  8) {
+  int compute_units =
+      policy.queue()
+          .get_device()
+          .template get_info<sycl::info::device::max_compute_units>();
+  auto sg_sizes = policy.queue()
+                      .get_device()
+                      .template get_info<sycl::info::device::sub_group_sizes>();
+  int subgroup_size = sg_sizes.empty() ? 1 : sg_sizes.back();
+  // parallel for of serial sorts when we have sufficient number of segments for
+  // load balance when number of segments is large as compared to our target
+  // compute capability
+  if (nsegments >
+      compute_units *
+          (policy.queue().get_device().is_gpu() ? subgroup_size : 1)) {
+    dpct::internal::segmented_sort_pairs_by_parallel_for_of_sorts(
+        ::std::forward<Policy>(policy), keys_in, keys_out, values_in,
+        values_out, n, nsegments, begin_offsets, end_offsets, descending,
+        begin_bit, end_bit);
+  } else
+  {
+    dpct::internal::segmented_sort_pairs_by_parallel_sorts(
+        ::std::forward<Policy>(policy), keys_in, keys_out, values_in,
+        values_out, n, nsegments, begin_offsets, end_offsets, descending,
+        begin_bit, end_bit);
+  }
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
+inline void segmented_sort_pairs(
+    Policy &&policy, io_iterator_pair<Iter1> &keys,
+    io_iterator_pair<Iter2> &values, ::std::int64_t n, ::std::int64_t nsegments,
+    Iter3 begin_offsets, Iter3 end_offsets, bool descending = false,
+    bool do_swap_iters = false, int begin_bit = 0,
+    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
+                  8) {
+  segmented_sort_pairs(std::forward<Policy>(policy), keys.first(),
+                       keys.second(), values.first(), values.second(), n,
+                       nsegments, begin_offsets, end_offsets, descending,
+                       begin_bit, end_bit);
+  if (do_swap_iters) {
+    keys.swap();
+    values.swap();
+  }
+}
+
+template <typename Policy, typename Iter1, typename Iter2>
+inline void reduce_argmax(Policy &&policy, Iter1 input, Iter2 output,
+                          ::std::size_t n) {
+  dpct::arg_index_input_iterator<decltype(input), int> input_arg_idx(input);
+  auto ret = ::std::max_element(
+      policy, input_arg_idx, input_arg_idx + n,
+      [](const auto &a, const auto &b) { return (a.value < b.value); });
+  ::std::copy(::std::forward<Policy>(policy), ret, ret + 1, output);
+}
+
+template <typename Policy, typename Iter1, typename Iter2>
+inline void reduce_argmin(Policy &&policy, Iter1 input, Iter2 output,
+                          ::std::size_t n) {
+  dpct::arg_index_input_iterator<decltype(input), int> input_arg_idx(input);
+  auto ret = ::std::min_element(
+      policy, input_arg_idx, input_arg_idx + n,
+      [](const auto &a, const auto &b) { return (a.value < b.value); });
+  ::std::copy(::std::forward<Policy>(policy), ret, ret + 1, output);
+}
+
+template <typename Policy, typename Iter1, typename ValueT, typename CompT>
+inline ::std::pair<Iter1, Iter1> equal_range(Policy &&policy, Iter1 start,
+                                             Iter1 end, const ValueT &value,
+                                             CompT comp) {
+  ::std::vector<::std::int64_t> res_lower(1);
+  ::std::vector<::std::int64_t> res_upper(1);
+  ::std::vector<ValueT> value_vec(1, value);
+  ::oneapi::dpl::lower_bound(policy, start, end, value_vec.begin(),
+                             value_vec.end(), res_lower.begin(), comp);
+  ::oneapi::dpl::upper_bound(::std::forward<Policy>(policy), start, end,
+                             value_vec.begin(), value_vec.end(),
+                             res_upper.begin(), comp);
+  return ::std::make_pair(start + res_lower[0], start + res_upper[0]);
+}
+
+template <typename Policy, typename Iter1, typename ValueT>
+inline ::std::pair<Iter1, Iter1> equal_range(Policy &&policy, Iter1 start,
+                                             Iter1 end, const ValueT &value) {
+  return equal_range(::std::forward<Policy>(policy), start, end, value,
+                     internal::__less());
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
+inline ::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+    dpct::internal::is_iterator<Iter2>::value &&
+    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
+segmented_reduce_argmin(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
+                        ::std::int64_t nsegments, Iter3 begin_offsets,
+                        Iter3 end_offsets) {
+  policy.queue().submit([&](sycl::handler &cgh) {
+    cgh.parallel_for(nsegments, [=](sycl::id<1> i) {
+      if (end_offsets[i] <= begin_offsets[i]) {
+        keys_out[i] = dpct::key_value_pair(
+            1, ::std::numeric_limits<
+                   typename ::std::iterator_traits<Iter1>::value_type>::max());
+      } else {
+        dpct::arg_index_input_iterator<Iter1, int> arg_index(keys_in +
+                                                             begin_offsets[i]);
+        keys_out[i] = *::std::min_element(
+            arg_index, arg_index + (end_offsets[i] - begin_offsets[i]),
+            [](const auto &a, const auto &b) { return a.value < b.value; });
+      }
+    });
+  });
+  policy.queue().wait();
+}
+
+template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
+inline ::std::enable_if_t<
+    dpct::internal::is_iterator<Iter1>::value &&
+    dpct::internal::is_iterator<Iter2>::value &&
+    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
+segmented_reduce_argmax(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
+                        ::std::int64_t nsegments, Iter3 begin_offsets,
+                        Iter3 end_offsets) {
+  policy.queue().submit([&](sycl::handler &cgh) {
+    cgh.parallel_for(nsegments, [=](sycl::id<1> i) {
+      if (end_offsets[i] <= begin_offsets[i]) {
+        keys_out[i] = dpct::key_value_pair(
+            1,
+            ::std::numeric_limits<
+                typename ::std::iterator_traits<Iter1>::value_type>::lowest());
+      } else {
+        dpct::arg_index_input_iterator<Iter1, int> arg_index(keys_in +
+                                                             begin_offsets[i]);
+        keys_out[i] = *::std::max_element(
+            arg_index, arg_index + (end_offsets[i] - begin_offsets[i]),
+            [](const auto &a, const auto &b) { return a.value < b.value; });
+      }
+    });
+  });
+  policy.queue().wait();
+}
+
+template <typename ExecutionPolicy, typename InputIterator,
+          typename OutputIterator1, typename OutputIterator2,
+          typename OutputIterator3>
+void nontrivial_run_length_encode(ExecutionPolicy &&policy,
+                                  InputIterator input_beg,
+                                  OutputIterator1 offsets_out,
+                                  OutputIterator2 lengths_out,
+                                  OutputIterator3 num_runs,
+                                  ::std::int64_t num_items) {
+  using oneapi::dpl::make_transform_iterator;
+  using oneapi::dpl::make_zip_iterator;
+  using offsets_t =
+      typename ::std::iterator_traits<OutputIterator1>::value_type;
+  using lengths_t =
+      typename ::std::iterator_traits<OutputIterator2>::value_type;
+
+  auto input_end = input_beg + num_items;
+  // First element must be nontrivial run (start of first segment)
+  auto first_adj_it = oneapi::dpl::adjacent_find(policy, input_beg, input_end);
+  auto first_adj_idx = ::std::distance(input_beg, first_adj_it);
+  if (first_adj_it == input_end) {
+    ::std::fill(policy, num_runs, num_runs + 1, 0);
+    return;
+  }
+  auto get_prev_idx_element = [first_adj_idx](const auto &idx) {
+    auto out_idx = idx + first_adj_idx;
+    return (out_idx == 0) ? 0 : out_idx - 1;
+  };
+  auto get_next_idx_element = [first_adj_idx, num_items](const auto &idx) {
+    auto out_idx = idx + first_adj_idx;
+    return (out_idx == num_items - 1) ? num_items - 1 : out_idx + 1;
+  };
+  // TODO: Use shifted view to pad range once oneDPL ranges is non-experimental
+  auto left_shifted_input_beg =
+      oneapi::dpl::make_permutation_iterator(input_beg, get_prev_idx_element);
+  auto right_shifted_input_beg =
+      oneapi::dpl::make_permutation_iterator(input_beg, get_next_idx_element);
+  // Segment type for ith idx consists of zip of iterators at (i-1, i, i+1)
+  // padded at the ends
+  auto zipped_keys_beg = make_zip_iterator(
+      left_shifted_input_beg, input_beg, right_shifted_input_beg,
+      oneapi::dpl::counting_iterator<offsets_t>(0));
+  // Set flag at the beginning of new nontrivial run (ex: (2, 3, 3) -> 1)
+  auto key_flags_beg =
+      make_transform_iterator(zipped_keys_beg, [num_items](const auto &zipped) {
+        using ::std::get;
+        bool last_idx_mask = get<3>(zipped) != num_items - 1;
+        return (get<0>(zipped) != get<1>(zipped) &&
+                get<1>(zipped) == get<2>(zipped)) &&
+               last_idx_mask;
+      });
+  auto count_beg = oneapi::dpl::counting_iterator<offsets_t>(0);
+  auto const_it = dpct::make_constant_iterator(lengths_t(1));
+  // Check for presence of nontrivial element at current index
+  auto tr_nontrivial_flags = make_transform_iterator(
+      make_zip_iterator(left_shifted_input_beg, input_beg),
+      [](const auto &zip) {
+        using ::std::get;
+        return get<0>(zip) == get<1>(zip);
+      });
+  auto zipped_vals_beg =
+      make_zip_iterator(tr_nontrivial_flags, count_beg, const_it);
+  auto pred = [](bool lhs, bool rhs) { return !rhs; };
+  auto op = [](auto lhs, const auto &rhs) {
+    using ::std::get;
+
+    // Update length count of run.
+    // The first call to this op will use the first element of the input as lhs
+    // and second element as rhs. get<0>(first_element) is ignored in favor of a
+    // constant `1` in get<2>, avoiding the need for special casing the first
+    // element. The constant `1` utilizes the knowledge that each segment begins
+    // with a nontrivial run.
+    get<2>(lhs) += get<0>(rhs);
+
+    // A run's starting index is stored in get<1>(lhs) as the initial value in
+    // the segment and is preserved throughout the segment's reduction as the
+    // nontrivial run's offset.
+
+    return ::std::move(lhs);
+  };
+  auto zipped_out_beg = make_zip_iterator(oneapi::dpl::discard_iterator(),
+                                          offsets_out, lengths_out);
+  auto [_, zipped_out_vals_end] = oneapi::dpl::reduce_by_segment(
+      policy, key_flags_beg + first_adj_idx, key_flags_beg + num_items,
+      zipped_vals_beg + first_adj_idx, oneapi::dpl::discard_iterator(),
+      zipped_out_beg, pred, op);
+  auto ret_dist = ::std::distance(zipped_out_beg, zipped_out_vals_end);
+  ::std::fill(policy, num_runs, num_runs + 1, ret_dist);
+}
+
+} // end namespace dpct
+
+#endif
diff --git a/dpct/dpl_extras/dpcpp_extensions.h b/dpct/dpl_extras/dpcpp_extensions.h
new file mode 100644
index 0000000000000..05a0068e65925
--- /dev/null
+++ b/dpct/dpl_extras/dpcpp_extensions.h
@@ -0,0 +1,747 @@
+//==---- dpcpp_extensions.h ------------------*- C++ -*---------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------===//
+
+#ifndef __DPCT_DPCPP_EXTENSIONS_H__
+#define __DPCT_DPCPP_EXTENSIONS_H__
+
+#include <sycl/sycl.hpp>
+#include <stdexcept>
+
+#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
+#include <sycl/ext/oneapi/experimental/user_defined_reductions.hpp>
+#endif
+
+#include "../dpct.hpp"
+#include "functional.h"
+
+namespace dpct {
+namespace group {
+namespace detail {
+
+template <typename... _Args>
+constexpr auto __reduce_over_group(_Args... __args) {
+  return sycl::reduce_over_group(__args...);
+}
+
+template <typename... _Args> constexpr auto __group_broadcast(_Args... __args) {
+  return sycl::group_broadcast(__args...);
+}
+
+template <typename... _Args>
+constexpr auto __exclusive_scan_over_group(_Args... __args) {
+  return sycl::exclusive_scan_over_group(__args...);
+}
+
+template <typename... _Args>
+constexpr auto __inclusive_scan_over_group(_Args... __args) {
+  return sycl::inclusive_scan_over_group(__args...);
+}
+
+} // end namespace detail
+
+/// Perform an exclusive scan over the values of inputs from all work-items in
+/// the group using the operator binary_op, which must be one of the SYCL 2020
+/// group algorithms library function objects.
+///
+/// \param item A work-item in a group.
+/// \param inputs Pointer to the input data for the scan operation.
+/// \param outputs Pointer to the location where scan results will be stored.
+/// \param init initial value of the scan result.
+/// \param binary_op functor that implements the binary operation used to
+/// perform the scan.
+template <typename Item, typename T, class BinaryOperation,
+          int VALUES_PER_THREAD>
+__dpct_inline__ void
+exclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD],
+               T (&outputs)[VALUES_PER_THREAD], T init,
+               BinaryOperation binary_op) {
+  T result = inputs[0];
+
+#pragma unroll
+  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
+    result = binary_op(result, inputs[i]);
+  }
+
+  T exclusive_result =
+      detail::__exclusive_scan_over_group(item.get_group(), result, binary_op);
+
+  T input = inputs[0];
+  if (item.get_local_linear_id() == 0) {
+    outputs[0] = init;
+  } else {
+    outputs[0] = exclusive_result;
+  }
+
+#pragma unroll
+  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
+    T output = binary_op(input, outputs[i - 1]);
+    input = inputs[i];
+    outputs[i] = output;
+  }
+}
+
+/// Perform an exclusive scan over the values of input from all work-items in
+/// the group using the operator binary_op, which must be one of the SYCL 2020
+/// group algorithms library function objects.
+///
+/// \param item A work-item in a group.
+/// \param input Input data for the scan operation.
+/// \param init initial value of the scan result.
+/// \param binary_op functor that implements the binary operation used to
+/// perform the scan. \param group_aggregate group-wide aggregate of all inputs
+/// in the work-items of the group. \returns exclusive scan of the first i
+/// work-items where item is the i-th work item.
+template <typename Item, typename T, class BinaryOperation>
+__dpct_inline__ T
+exclusive_scan(const Item &item, T input, T init, BinaryOperation binary_op,
+               T &group_aggregate) {
+  T output = detail::__exclusive_scan_over_group(item.get_group(), input, init,
+                                                 binary_op);
+  if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
+    group_aggregate = binary_op(output, input);
+  }
+
+  group_aggregate = detail::__group_broadcast(
+      item.get_group(), group_aggregate, item.get_local_range().size() - 1);
+  return output;
+}
+
+/// Perform an exclusive scan over the values of input from all work-items in
+/// the group using the operator binary_op, which must be one of the SYCL 2020
+/// group algorithms library function objects.
+///
+/// \param item A work-item in a group.
+/// \param input Input data for the scan operation.
+/// \param binary_op functor that implements the binary operation used to
+/// perform the scan. \param prefix_callback_op functor invoked by the first
+/// work-item in the group that returns the
+///        initial value in the resulting scan of the work-items in the group.
+/// \returns exclusive scan of the input elements assigned to work-items in the
+/// group.
+template <typename Item, typename T, class BinaryOperation,
+          class GroupPrefixCallbackOperation>
+__dpct_inline__ T
+exclusive_scan(const Item &item, T input, BinaryOperation binary_op,
+               GroupPrefixCallbackOperation &prefix_callback_op) {
+  T group_aggregate;
+
+  T output =
+      detail::__exclusive_scan_over_group(item.get_group(), input, binary_op);
+  if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
+    group_aggregate = binary_op(output, input);
+  }
+
+  group_aggregate = detail::__group_broadcast(
+      item.get_group(), group_aggregate, item.get_local_range().size() - 1);
+
+  T group_prefix = prefix_callback_op(group_aggregate);
+  if (item.get_local_linear_id() == 0) {
+    output = group_prefix;
+  } else {
+    output = binary_op(group_prefix, output);
+  }
+
+  return output;
+}
+
+namespace detail {
+
+typedef uint16_t digit_counter_type;
+typedef uint32_t packed_counter_type;
+
+template <int N, int CURRENT_VAL = N, int COUNT = 0> struct log2 {
+  enum { VALUE = log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };
+};
+
+template <int N, int COUNT> struct log2<N, 0, COUNT> {
+  enum { VALUE = (1 << (COUNT - 1) < N) ? COUNT : COUNT - 1 };
+};
+
+template <int RADIX_BITS, bool DESCENDING = false> class radix_rank {
+public:
+  static size_t get_local_memory_size(size_t group_threads) {
+    return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type);
+  }
+
+  radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  template <typename Item, int VALUES_PER_THREAD>
+  __dpct_inline__ void
+  rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD],
+            int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) {
+
+    digit_counter_type thread_prefixes[VALUES_PER_THREAD];
+    digit_counter_type *digit_counters[VALUES_PER_THREAD];
+    digit_counter_type *buffer =
+        reinterpret_cast<digit_counter_type *>(_local_memory);
+
+    reset_local_memory(item);
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+#pragma unroll
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      uint32_t digit = ::dpct::bfe(keys[i], current_bit, num_bits);
+      uint32_t sub_counter = digit >> LOG_COUNTER_LANES;
+      uint32_t counter_lane = digit & (COUNTER_LANES - 1);
+
+      if (DESCENDING) {
+        sub_counter = PACKING_RATIO - 1 - sub_counter;
+        counter_lane = COUNTER_LANES - 1 - counter_lane;
+      }
+
+      digit_counters[i] =
+          &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO +
+                  item.get_local_linear_id() * PACKING_RATIO + sub_counter];
+      thread_prefixes[i] = *digit_counters[i];
+      *digit_counters[i] = thread_prefixes[i] + 1;
+    }
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+    scan_counters(item);
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      ranks[i] = thread_prefixes[i] + *digit_counters[i];
+    }
+  }
+
+private:
+  template <typename Item>
+  __dpct_inline__ void reset_local_memory(const Item &item) {
+    packed_counter_type *ptr =
+        reinterpret_cast<packed_counter_type *>(_local_memory);
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0;
+    }
+  }
+
+  template <typename Item>
+  __dpct_inline__ packed_counter_type upsweep(const Item &item) {
+    packed_counter_type sum = 0;
+    packed_counter_type *ptr =
+        reinterpret_cast<packed_counter_type *>(_local_memory);
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; i++) {
+      cached_segment[i] =
+          ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i];
+    }
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      sum += cached_segment[i];
+    }
+
+    return sum;
+  }
+
+  template <typename Item>
+  __dpct_inline__ void
+  exclusive_downsweep(const Item &item, packed_counter_type raking_partial) {
+    packed_counter_type *ptr =
+        reinterpret_cast<packed_counter_type *>(_local_memory);
+    packed_counter_type sum = raking_partial;
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      packed_counter_type value = cached_segment[i];
+      cached_segment[i] = sum;
+      sum += value;
+    }
+
+#pragma unroll
+    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
+      ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] =
+          cached_segment[i];
+    }
+  }
+
+  struct prefix_callback {
+    __dpct_inline__ packed_counter_type
+    operator()(packed_counter_type block_aggregate) {
+      packed_counter_type block_prefix = 0;
+
+#pragma unroll
+      for (int packed = 1; packed < PACKING_RATIO; packed++) {
+        block_prefix += block_aggregate
+                        << (sizeof(digit_counter_type) * 8 * packed);
+      }
+
+      return block_prefix;
+    }
+  };
+
+  template <typename Item>
+  __dpct_inline__ void scan_counters(const Item &item) {
+    packed_counter_type raking_partial = upsweep(item);
+
+    prefix_callback callback;
+    packed_counter_type exclusive_partial = exclusive_scan(
+        item, raking_partial, sycl::ext::oneapi::plus<packed_counter_type>(),
+        callback);
+
+    exclusive_downsweep(item, exclusive_partial);
+  }
+
+private:
+  static constexpr int PACKING_RATIO =
+      sizeof(packed_counter_type) / sizeof(digit_counter_type);
+  static constexpr int LOG_PACKING_RATIO = log2<PACKING_RATIO>::VALUE;
+  static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO;
+  static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES;
+  static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1;
+
+  packed_counter_type cached_segment[PADDED_COUNTER_LANES];
+  uint8_t *_local_memory;
+};
+
+template <typename T, typename U> struct base_traits {
+
+  static __dpct_inline__ U twiddle_in(U key) {
+    throw std::runtime_error("Not implemented");
+  }
+  static __dpct_inline__ U twiddle_out(U key) {
+    throw std::runtime_error("Not implemented");
+  }
+};
+
+template <typename U> struct base_traits<uint32_t, U> {
+  static __dpct_inline__ U twiddle_in(U key) { return key; }
+  static __dpct_inline__ U twiddle_out(U key) { return key; }
+};
+
+template <typename U> struct base_traits<int, U> {
+  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
+  static __dpct_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; }
+  static __dpct_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; }
+};
+
+template <typename U> struct base_traits<float, U> {
+  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
+  static __dpct_inline__ U twiddle_in(U key) {
+    U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT;
+    return key ^ mask;
+  }
+  static __dpct_inline__ U twiddle_out(U key) {
+    U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1);
+    return key ^ mask;
+  }
+};
+
+template <typename T> struct traits : base_traits<T, T> {};
+template <> struct traits<uint32_t> : base_traits<uint32_t, uint32_t> {};
+template <> struct traits<int> : base_traits<int, uint32_t> {};
+template <> struct traits<float> : base_traits<float, uint32_t> {};
+
+} // namespace detail
+
+namespace detail {
+
+template <int N> struct power_of_two {
+  enum { VALUE = ((N & (N - 1)) == 0) };
+};
+
+__dpct_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, uint32_t addend) {
+  return (x >> shift) + addend;
+}
+
+} // namespace detail
+
+/// Implements scatter to blocked exchange pattern used in radix sort algorithm.
+///
+/// \tparam T type of the data elements exchanges
+/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
+template <typename T, int VALUES_PER_THREAD> class exchange {
+public:
+  static size_t get_local_memory_size(size_t group_threads) {
+    size_t padding_values =
+        (INSERT_PADDING)
+            ? ((group_threads * VALUES_PER_THREAD) >> LOG_LOCAL_MEMORY_BANKS)
+            : 0;
+    return (group_threads * VALUES_PER_THREAD + padding_values) * sizeof(T);
+  }
+
+  exchange(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  /// Rearrange elements from rank order to blocked order
+  template <typename Item>
+  __dpct_inline__ void
+  scatter_to_blocked(Item item, T (&keys)[VALUES_PER_THREAD],
+                     int (&ranks)[VALUES_PER_THREAD]) {
+    T *buffer = reinterpret_cast<T *>(_local_memory);
+
+#pragma unroll
+    for (int i = 0; i < VALUES_PER_THREAD; i++) {
+      int offset = ranks[i];
+      if (INSERT_PADDING)
+        offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
+      buffer[offset] = keys[i];
+    }
+
+    item.barrier(sycl::access::fence_space::local_space);
+
+#pragma unroll
+    for (int i = 0; i < VALUES_PER_THREAD; i++) {
+      int offset = (item.get_local_id(0) * VALUES_PER_THREAD) + i;
+      if (INSERT_PADDING)
+        offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
+      keys[i] = buffer[offset];
+    }
+  }
+
+private:
+  static constexpr int LOG_LOCAL_MEMORY_BANKS = 5;
+  static constexpr bool INSERT_PADDING =
+      (VALUES_PER_THREAD > 4) &&
+      (detail::power_of_two<VALUES_PER_THREAD>::VALUE);
+
+  uint8_t *_local_memory;
+};
+
+/// Implements radix sort to sort integer data elements assigned to all threads
+/// in the group.
+///
+/// \tparam T type of the data elements exchanges
+/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
+/// \tparam DECENDING boolean value indicating if data elements are sorted in
+/// decending order.
+template <typename T, int VALUES_PER_THREAD, bool DESCENDING = false>
+class radix_sort {
+public:
+  static size_t get_local_memory_size(size_t group_threads) {
+    size_t ranks_size =
+        detail::radix_rank<RADIX_BITS>::get_local_memory_size(group_threads);
+    size_t exchange_size =
+        exchange<T, VALUES_PER_THREAD>::get_local_memory_size(group_threads);
+    return sycl::max(ranks_size, exchange_size);
+  }
+
+  radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {}
+
+  template <typename Item>
+  __dpct_inline__ void
+  sort(const Item &item, T (&keys)[VALUES_PER_THREAD], int begin_bit = 0,
+       int end_bit = 8 * sizeof(T)) {
+
+    uint32_t(&unsigned_keys)[VALUES_PER_THREAD] =
+        reinterpret_cast<uint32_t(&)[VALUES_PER_THREAD]>(keys);
+
+#pragma unroll
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      unsigned_keys[i] = detail::traits<T>::twiddle_in(unsigned_keys[i]);
+    }
+
+    while (true) {
+      int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit);
+
+      int ranks[VALUES_PER_THREAD];
+      detail::radix_rank<RADIX_BITS, DESCENDING>(_local_memory)
+          .template rank_keys(item, unsigned_keys, ranks, begin_bit, pass_bits);
+      begin_bit += RADIX_BITS;
+
+      item.barrier(sycl::access::fence_space::local_space);
+
+      exchange<T, VALUES_PER_THREAD>(_local_memory)
+          .scatter_to_blocked(item, keys, ranks);
+
+      item.barrier(sycl::access::fence_space::local_space);
+
+      if (begin_bit >= end_bit)
+        break;
+    }
+
+#pragma unroll
+    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
+      unsigned_keys[i] = detail::traits<T>::twiddle_out(unsigned_keys[i]);
+    }
+  }
+
+private:
+  static constexpr int RADIX_BITS = 4;
+
+  uint8_t *_local_memory;
+};
+
+/// Perform a reduction of the data elements assigned to all threads in the
+/// group.
+///
+/// \param item A work-item in a group.
+/// \param inputs Pointer to the input data for the reduce operation.
+/// \param binary_op functor that implements the binary operation used to
+/// perform the scan. \returns value of the reduction using binary_op
+template <typename Item, typename T, class BinaryOperation,
+          int VALUES_PER_THREAD>
+__dpct_inline__ T
+reduce(Item item, T (&inputs)[VALUES_PER_THREAD], BinaryOperation binary_op) {
+  T result = inputs[0];
+
+#pragma unroll
+  for (int i = 1; i < VALUES_PER_THREAD; i++) {
+    result = binary_op(result, inputs[i]);
+  }
+  return detail::__reduce_over_group(item.get_group(), result, binary_op);
+}
+
+/// Perform a reduction on a limited number of the work items in a subgroup
+///
+/// \param item A work-item in a group.
+/// \param value value per work item which is to be reduced
+/// \param items_to_reduce num work items at the start of the subgroup to reduce
+/// \param binary_op functor that implements the binary operation used to
+/// perform the scan. \returns value of the reduction using binary_op
+template <typename Item, typename T, class BinaryOperation>
+__dpct_inline__
+typename ::std::enable_if_t<sycl::has_known_identity_v<BinaryOperation, T>, T>
+reduce_over_partial_group(const Item &item, const T &value,
+                          const ::std::uint16_t &items_to_reduce,
+                          BinaryOperation binary_op) {
+  T value_temp = (item.get_local_linear_id() < items_to_reduce)
+                     ? value
+                     : sycl::known_identity_v<BinaryOperation, T>;
+  return detail::__reduce_over_group(item.get_sub_group(), value_temp,
+                                     binary_op);
+}
+
+/// Perform an inclusive scan over the values of inputs from all work-items in
+/// the group using the operator binary_op, which must be one of the SYCL 2020
+/// group algorithms library function objects.
+///
+/// \param item A work-item in a group.
+/// \param inputs Pointer to the input data for the scan operation.
+/// \param outputs Pointer to the location where scan results will be stored.
+/// \param binary_op functor that implements the binary operation used to
+/// perform the scan. \returns inclusive scan of the input elements assigned to
+/// work-items in the group.
+template <typename Item, typename T, class BinaryOperation,
+          int VALUES_PER_THREAD>
+__dpct_inline__ void
+inclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD],
+               T (&outputs)[VALUES_PER_THREAD], BinaryOperation binary_op) {
+  T result = inputs[0];
+
+#pragma unroll
+  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
+    result = binary_op(result, inputs[i]);
+  }
+
+  T exclusive_result =
+      detail::__exclusive_scan_over_group(item.get_group(), result, binary_op);
+
+  if (item.get_local_linear_id() == 0) {
+    outputs[0] = inputs[0];
+  } else {
+    outputs[0] = binary_op(inputs[0], exclusive_result);
+  }
+
+#pragma unroll
+  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
+    outputs[i] = binary_op(inputs[i], outputs[i - 1]);
+  }
+}
+
+/// Perform an inclusive scan over the values of inputs from all work-items in
+/// the group using the operator binary_op, which must be one of the SYCL 2020
+/// group algorithms library function objects.
+///
+/// \param item A work-item in a group.
+/// \param input Pointer to the input data for the scan operation.
+/// \param binary_op functor that implements the binary operation used to
+/// perform the scan. \param group_aggregate group-wide aggregate of all inputs
+/// in the work-items of the group. \returns inclusive scan of the input
+/// elements assigned to work-items in the group.
+template <typename Item, typename T, class BinaryOperation>
+__dpct_inline__ T inclusive_scan(const Item &item, T input,
+                                                BinaryOperation binary_op,
+                                                T &group_aggregate) {
+  T output =
+      detail::__inclusive_scan_over_group(item.get_group(), input, binary_op);
+  if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
+    group_aggregate = output;
+  }
+
+  group_aggregate = detail::__group_broadcast(
+      item.get_group(), group_aggregate, item.get_local_range().size() - 1);
+  return output;
+}
+
+/// Perform an inclusive scan over the values of input from all work-items in
+/// the group using the operator binary_op, which must be one of the SYCL 2020
+/// group algorithms library function objects.
+///
+/// \param item A work-item in a group.
+/// \param input Input data for the scan operation.
+/// \param binary_op functor that implements the binary operation used to
+/// perform the scan. \param prefix_callback_op functor invoked by the first
+/// work-item in the group that returns the
+///        initial value in the resulting scan of the work-items in the group.
+/// \returns inclusive scan of the input elements assigned to work-items in the
+/// group.
+template <typename Item, typename T, class BinaryOperation,
+          class GroupPrefixCallbackOperation>
+__dpct_inline__ T
+inclusive_scan(const Item &item, T input, BinaryOperation binary_op,
+               GroupPrefixCallbackOperation &prefix_callback_op) {
+  T group_aggregate;
+
+  T output = inclusive_scan(item, input, binary_op, group_aggregate);
+  T group_prefix = prefix_callback_op(group_aggregate);
+
+  return binary_op(group_prefix, output);
+}
+
+} // namespace group
+
+namespace device {
+
+namespace detail {
+
+template <typename... _Args> constexpr auto __joint_reduce(_Args... __args) {
+  return sycl::joint_reduce(__args...);
+}
+
+} // namespace detail
+
+/// Perform a reduce on each of the segments specified within data stored on
+/// the device.
+///
+/// \param queue Command queue used to access device used for reduction
+/// \param inputs Pointer to the data elements on the device to be reduced
+/// \param outputs Pointer to the storage where the reduced value for each
+/// segment will be stored \param segment_count number of segments to be reduced
+/// \param begin_offsets Pointer to the set of indices that are the first
+/// element in each segment \param end_offsets Pointer to the set of indices
+/// that are one past the last element in each segment \param binary_op functor
+/// that implements the binary operation used to perform the scan. \param init
+/// initial value of the reduction for each segment.
+template <int GROUP_SIZE, typename T, typename OffsetT, class BinaryOperation>
+void segmented_reduce(sycl::queue queue, T *inputs, T *outputs,
+                      size_t segment_count, OffsetT *begin_offsets,
+                      OffsetT *end_offsets, BinaryOperation binary_op, T init) {
+
+  sycl::range<1> global_size(segment_count * GROUP_SIZE);
+  sycl::range<1> local_size(GROUP_SIZE);
+
+  queue.submit([&](sycl::handler &cgh) {
+    cgh.parallel_for(
+        sycl::nd_range<1>(global_size, local_size), [=](sycl::nd_item<1> item) {
+          OffsetT segment_begin = begin_offsets[item.get_group_linear_id()];
+          OffsetT segment_end = end_offsets[item.get_group_linear_id()];
+          if (segment_begin == segment_end) {
+            if (item.get_local_linear_id() == 0) {
+              outputs[item.get_group_linear_id()] = init;
+            }
+            return;
+          }
+
+          sycl::multi_ptr<T, sycl::access::address_space::global_space>
+              input_ptr = inputs;
+          T group_aggregate = detail::__joint_reduce(
+              item.get_group(), input_ptr + segment_begin,
+              input_ptr + segment_end, init, binary_op);
+
+          if (item.get_local_linear_id() == 0) {
+            outputs[item.get_group_linear_id()] = group_aggregate;
+          }
+        });
+  });
+}
+
+
+#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
+
+namespace experimental {
+namespace detail {
+template <typename _Tp, typename... _Ts> struct __is_any {
+  constexpr static bool value = std::disjunction_v<
+      std::is_same<std::remove_cv_t<_Tp>, std::remove_cv_t<_Ts>>...>;
+};
+
+template <typename _Tp, typename _Bp> struct __in_native_op_list {
+  constexpr static bool value =
+      __is_any<_Bp, sycl::plus<_Tp>, sycl::bit_or<_Tp>, sycl::bit_xor<_Tp>,
+               sycl::bit_and<_Tp>, sycl::maximum<_Tp>, sycl::minimum<_Tp>,
+               sycl::multiplies<_Tp>>::value;
+};
+
+template <typename _Tp, typename _Bp> struct __is_native_op {
+  constexpr static bool value = __in_native_op_list<_Tp, _Bp>::value ||
+                                __in_native_op_list<void, _Bp>::value;
+};
+
+} // namespace detail
+
+/// Perform a reduce on each of the segments specified within data stored on
+/// the device. Compared with dpct::device::segmented_reduce, this experimental
+/// feature support user define reductions.
+///
+/// \param queue Command queue used to access device used for reduction
+/// \param inputs Pointer to the data elements on the device to be reduced
+/// \param outputs Pointer to the storage where the reduced value for each
+/// segment will be stored \param segment_count number of segments to be reduced
+/// \param begin_offsets Pointer to the set of indices that are the first
+/// element in each segment \param end_offsets Pointer to the set of indices
+/// that are one past the last element in each segment \param binary_op functor
+/// that implements the binary operation used to perform the scan. \param init
+/// initial value of the reduction for each segment.
+template <int GROUP_SIZE, typename T, typename OffsetT, class BinaryOperation>
+void segmented_reduce(sycl::queue queue, T *inputs, T *outputs,
+                      size_t segment_count, OffsetT *begin_offsets,
+                      OffsetT *end_offsets, BinaryOperation binary_op, T init) {
+
+  sycl::range<1> global_size(segment_count * GROUP_SIZE);
+  sycl::range<1> local_size(GROUP_SIZE);
+
+  if constexpr (!detail::__is_native_op<T, BinaryOperation>::value) {
+    queue.submit([&](sycl::handler &cgh) {
+      size_t temp_memory_size = GROUP_SIZE * sizeof(T);
+      auto scratch = sycl::local_accessor<std::byte, 1>(temp_memory_size, cgh);
+      cgh.parallel_for(
+          sycl::nd_range<1>(global_size, local_size),
+          [=](sycl::nd_item<1> item) {
+            OffsetT segment_begin = begin_offsets[item.get_group_linear_id()];
+            OffsetT segment_end = end_offsets[item.get_group_linear_id()];
+            if (segment_begin == segment_end) {
+              if (item.get_local_linear_id() == 0) {
+                outputs[item.get_group_linear_id()] = init;
+              }
+              return;
+            }
+            // Create a handle that associates the group with an allocation it
+            // can use
+            auto handle =
+                sycl::ext::oneapi::experimental::group_with_scratchpad(
+                    item.get_group(),
+                    sycl::span(&scratch[0], temp_memory_size));
+            T group_aggregate = sycl::ext::oneapi::experimental::joint_reduce(
+                handle, inputs + segment_begin, inputs + segment_end, init,
+                binary_op);
+            if (item.get_local_linear_id() == 0) {
+              outputs[item.get_group_linear_id()] = group_aggregate;
+            }
+          });
+    });
+  } else {
+    dpct::device::segmented_reduce<GROUP_SIZE>(queue, inputs, outputs,
+                                               segment_count, begin_offsets,
+                                               end_offsets, binary_op, init);
+  }
+}
+} // namespace experimental
+
+#endif // SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
+
+
+} // namespace device
+} // namespace dpct
+
+#endif
diff --git a/dpct/dpl_extras/functional.h b/dpct/dpl_extras/functional.h
new file mode 100644
index 0000000000000..bab82814c2103
--- /dev/null
+++ b/dpct/dpl_extras/functional.h
@@ -0,0 +1,453 @@
+//==---- functional.h -----------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_FUNCTIONAL_H__
+#define __DPCT_FUNCTIONAL_H__
+
+#include <functional>
+#include <oneapi/dpl/functional>
+#include <oneapi/dpl/iterator>
+
+#if ONEDPL_USE_DPCPP_BACKEND
+#include <oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h>
+#endif
+
+#include <tuple>
+#include <utility>
+
+#include "../dpct.hpp"
+#define _DPCT_GCC_VERSION                                                      \
+  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+
+// Portability "#pragma" definition
+#ifdef _MSC_VER
+#define _DPCT_PRAGMA(x) __pragma(x)
+#else
+#define _DPCT_PRAGMA(x) _Pragma(#x)
+#endif
+
+// Enable loop unrolling pragmas where supported
+#if (__INTEL_COMPILER ||                                                       \
+     (!defined(__INTEL_COMPILER) && _DPCT_GCC_VERSION >= 80000))
+#define _DPCT_PRAGMA_UNROLL _DPCT_PRAGMA(unroll)
+#else // no pragma unroll
+#define _DPCT_PRAGMA_UNROLL
+#endif
+
+namespace dpct {
+
+struct null_type {};
+
+// Function object to wrap user defined functors to provide compile time "const"
+// workaround for user function objects.
+// The SYCL spec (4.12) states that writing to a function object during a SYCL
+// kernel is undefined behavior.  This wrapper is provided as a compile-time
+// work around, but functors used in SYCL kernels must be `const` in practice.
+template <typename _Op> struct mark_functor_const {
+  mutable _Op op;
+  mark_functor_const() : op() {}
+  mark_functor_const(const _Op &__op) : op(__op) {}
+  mark_functor_const(_Op &&__op) : op(::std::move(__op)) {}
+  template <typename... _T> auto operator()(_T &&...x) const {
+    return op(std::forward<_T>(x)...);
+  }
+};
+
+namespace internal {
+
+template <class _ExecPolicy, class _T>
+using enable_if_execution_policy =
+    typename std::enable_if<oneapi::dpl::execution::is_execution_policy<
+                                typename std::decay<_ExecPolicy>::type>::value,
+                            _T>::type;
+
+template <typename _T> struct is_hetero_execution_policy : ::std::false_type {};
+
+template <typename... PolicyParams>
+struct is_hetero_execution_policy<
+    oneapi::dpl::execution::device_policy<PolicyParams...>> : ::std::true_type {
+};
+
+template <typename _T> struct is_fpga_execution_policy : ::std::false_type {};
+
+#if _ONEDPL_FPGA_DEVICE
+template <unsigned int unroll_factor, typename... PolicyParams>
+struct is_hetero_execution_policy<
+    execution::fpga_policy<unroll_factor, PolicyParams...>> : ::std::true_type {
+};
+#endif
+
+template <class _ExecPolicy, class _T>
+using enable_if_hetero_execution_policy = typename std::enable_if<
+    is_hetero_execution_policy<typename std::decay<_ExecPolicy>::type>::value,
+    _T>::type;
+
+#if _ONEDPL_CPP14_INTEGER_SEQUENCE_PRESENT
+
+template <std::size_t... _Sp>
+using index_sequence = std::index_sequence<_Sp...>;
+template <std::size_t _Np>
+using make_index_sequence = std::make_index_sequence<_Np>;
+
+#else
+
+template <std::size_t... _Sp> class index_sequence {};
+
+template <std::size_t _Np, std::size_t... _Sp>
+struct make_index_sequence_impl
+    : make_index_sequence_impl<_Np - 1, _Np - 1, _Sp...> {};
+
+template <std::size_t... _Sp> struct make_index_sequence_impl<0, _Sp...> {
+  using type = index_sequence<_Sp...>;
+};
+
+template <std::size_t _Np>
+using make_index_sequence = typename make_index_sequence_impl<_Np>::type;
+#endif
+
+// Minimal buffer implementations for temporary storage in mapping rules
+// Some of our algorithms need to start with raw memory buffer,
+// not an initialized array, because initialization/destruction
+// would make the span be at least O(N).
+#if ONEDPL_USE_DPCPP_BACKEND
+template <typename _Tp> class __buffer {
+  sycl::buffer<_Tp, 1> __buf;
+
+  __buffer(const __buffer &) = delete;
+
+  void operator=(const __buffer &) = delete;
+
+public:
+  // Try to obtain buffer of given size to store objects of _Tp type
+  __buffer(std::size_t __n) : __buf(sycl::range<1>(__n)) {}
+
+  // Return pointer to buffer, or  NULL if buffer could not be obtained.
+  auto get() -> decltype(oneapi::dpl::begin(__buf)) const {
+    return oneapi::dpl::begin(__buf);
+  }
+};
+#else
+template <typename _Tp> class __buffer {
+  std::unique_ptr<_Tp> _M_ptr;
+
+  __buffer(const __buffer &) = delete;
+
+  void operator=(const __buffer &) = delete;
+
+public:
+  // Try to obtain buffer of given size to store objects of _Tp type
+  __buffer(const std::size_t __n) : _M_ptr(new _Tp[__n]) {}
+
+  // Return pointer to buffer, or  NULL if buffer could not be obtained.
+  _Tp *get() const { return _M_ptr.get(); }
+};
+#endif
+
+// Implements C++14 std::less<void> specialization to allow parameter type
+// deduction.
+class __less {
+public:
+  template <typename _Xp, typename _Yp>
+  bool operator()(_Xp &&__x, _Yp &&__y) const {
+    return std::forward<_Xp>(__x) < std::forward<_Yp>(__y);
+  }
+};
+
+template <typename Policy, typename NewName> struct rebind_policy {
+  using type = Policy;
+};
+
+template <typename KernelName, typename NewName>
+struct rebind_policy<oneapi::dpl::execution::device_policy<KernelName>,
+                     NewName> {
+  using type = oneapi::dpl::execution::device_policy<NewName>;
+};
+
+#if _ONEDPL_FPGA_DEVICE
+template <unsigned int factor, typename KernelName, typename NewName>
+struct rebind_policy<oneapi::dpl::execution::fpga_policy<factor, KernelName>,
+                     NewName> {
+  using type = oneapi::dpl::execution::fpga_policy<factor, NewName>;
+};
+#endif
+
+template <typename T1, typename T2,
+          typename R1 = typename std::iterator_traits<T1>::reference,
+          typename R2 = typename std::iterator_traits<T2>::reference>
+struct perm_fun {
+  typedef R2 result_of;
+  perm_fun(T1 input) : source(input) {}
+
+  R2 operator()(R1 x) const { return *(source + x); }
+
+private:
+  T1 source;
+};
+
+// Functor compares first element (key) from tied sequence.
+template <typename Compare = class internal::__less> struct compare_key_fun {
+  typedef bool result_of;
+  compare_key_fun(Compare _comp = internal::__less()) : comp(_comp) {}
+
+  template <typename _T1, typename _T2>
+  result_of operator()(_T1 &&a, _T2 &&b) const {
+    using std::get;
+    return comp(get<0>(a), get<0>(b));
+  }
+
+private:
+  mutable Compare comp;
+};
+
+// Functor evaluates second element of tied sequence with predicate.
+// Used by: copy_if, remove_copy_if, stable_partition_copy
+// Lambda:
+template <typename Predicate> struct predicate_key_fun {
+  typedef bool result_of;
+  predicate_key_fun(Predicate _pred) : pred(_pred) {}
+
+  template <typename _T1> result_of operator()(_T1 &&a) const {
+    using std::get;
+    return pred(get<1>(a));
+  }
+
+private:
+  mutable Predicate pred;
+};
+
+// Used by: remove_if
+template <typename Predicate> struct negate_predicate_key_fun {
+  typedef bool result_of;
+  negate_predicate_key_fun(Predicate _pred) : pred(_pred) {}
+
+  template <typename _T1> result_of operator()(_T1 &&a) const {
+    using std::get;
+    return !pred(get<1>(a));
+  }
+
+private:
+  mutable Predicate pred;
+};
+
+template <typename T> struct sequence_fun {
+  using result_type = T;
+  sequence_fun(T _init, T _step) : init(_init), step(_step) {}
+
+  template <typename _T> result_type operator()(_T &&i) const {
+    return static_cast<T>(init + step * i);
+  }
+
+private:
+  const T init;
+  const T step;
+};
+
+//[binary_pred](Ref a, Ref b){ return(binary_pred(get<0>(a),get<0>(b)));
+template <typename Predicate> struct unique_fun {
+  typedef bool result_of;
+  unique_fun(Predicate _pred) : pred(_pred) {}
+  template <typename _T> result_of operator()(_T &&a, _T &&b) const {
+    using std::get;
+    return pred(get<0>(a), get<0>(b));
+  }
+
+private:
+  mutable Predicate pred;
+};
+
+// Lambda: [pred, &new_value](Ref1 a, Ref2 s) {return pred(s) ? new_value : a;
+// });
+template <typename T, typename Predicate> struct replace_if_fun {
+public:
+  typedef T result_of;
+  replace_if_fun(Predicate _pred, T _new_value)
+      : pred(_pred), new_value(_new_value) {}
+
+  template <typename _T1, typename _T2> T operator()(_T1 &&a, _T2 &&s) const {
+    return pred(s) ? new_value : a;
+  }
+
+private:
+  mutable Predicate pred;
+  const T new_value;
+};
+
+//[pred,op](Ref a){return pred(a) ? op(a) : a; }
+template <typename T, typename Predicate, typename Operator>
+struct transform_if_fun {
+  transform_if_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {}
+  template <typename _T>
+  void operator()(_T&& t) const {
+    using std::get;
+    if (pred(get<0>(t)))
+      get<1>(t) = op(get<0>(t));
+  }
+
+private:
+  mutable Predicate pred;
+  mutable Operator op;
+};
+
+//[pred, op](Ref1 a, Ref2 s) { return pred(s) ? op(a) : a; });
+template <typename T, typename Predicate, typename Operator>
+struct transform_if_unary_zip_mask_fun {
+  transform_if_unary_zip_mask_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {}
+  template <typename _T>
+  void operator()(_T&& t) const {
+    using std::get;
+    if (pred(get<1>(t)))
+      get<2>(t) = op(get<0>(t));
+  }
+
+private:
+  mutable Predicate pred;
+  mutable Operator op;
+};
+
+template <typename T, typename Predicate, typename BinaryOperation>
+class transform_if_zip_mask_fun {
+public:
+  transform_if_zip_mask_fun(Predicate _pred = oneapi::dpl::identity(),
+                            BinaryOperation _op = oneapi::dpl::identity())
+      : pred(_pred), op(_op) {}
+  template <typename _T> void operator()(_T &&t) const {
+    using std::get;
+    if (pred(get<2>(t)))
+      get<3>(t) = op(get<0>(t), get<1>(t));
+  }
+
+private:
+  mutable Predicate pred;
+  mutable BinaryOperation op;
+};
+
+// This following code is similar to a section of code in
+// oneDPL/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
+// It has a similar approach, and could be consolidated.
+// Outside of some differences in approach, there are two significant
+// differences in function.
+//
+// 1) This code allows the output type of the bit range translation to be fit
+// into to the minimal type required to provide that many bits. The code in
+// oneDPL to calculate the bucket for the radix is similar but its output is
+// always std::uint32_t.  The assumption that the bit range desired will fit in
+// 32 bits is not true for this code.
+//
+// 2) This code ensures that for floating point type, -0.0f and 0.0f map to the
+// same value.  This allows the output of this translation to be used to provide
+// a sort which ensures the stability of these values for floating point types.
+
+template <int N> struct uint_byte_map {};
+template <> struct uint_byte_map<1> { using type = uint8_t; };
+template <> struct uint_byte_map<2> { using type = uint16_t; };
+template <> struct uint_byte_map<4> { using type = uint32_t; };
+template <> struct uint_byte_map<8> { using type = uint64_t; };
+
+template <typename T> struct uint_map {
+  using type = typename uint_byte_map<sizeof(T)>::type;
+};
+
+template <typename T, typename OutKeyT> class translate_key {
+  using uint_type_t = typename uint_map<T>::type;
+
+public:
+  translate_key(int begin_bit, int end_bit) {
+    shift = begin_bit;
+    mask = ~OutKeyT(0); // all ones
+    mask = mask >> (sizeof(OutKeyT) * 8 -
+                    (end_bit - begin_bit));           // setup appropriate mask
+    flip_sign = uint_type_t(1) << (sizeof(uint_type_t) * 8 - 1); // sign bit
+    flip_key = ~uint_type_t(0);                       // 0xF...F
+  }
+
+  inline OutKeyT operator()(const T &key) const {
+    uint_type_t intermediate;
+    if constexpr (std::is_floating_point<T>::value) {
+        // normal case (both -0.0f and 0.0f equal -0.0f)
+        if (key != T(-0.0f)) {
+        uint_type_t is_negative = reinterpret_cast<const uint_type_t &>(key) >>
+              (sizeof(uint_type_t) * 8 - 1);
+          intermediate = reinterpret_cast<const uint_type_t &>(key) ^
+                         ((is_negative * flip_key) | flip_sign);
+        } else // special case for -0.0f to keep stability with 0.0f
+        {
+          T negzero = T(-0.0f);
+          intermediate = reinterpret_cast<const uint_type_t &>(negzero);
+        }
+    } else if constexpr (std::is_signed<T>::value) {
+        intermediate = reinterpret_cast<const uint_type_t &>(key) ^ flip_sign;
+    } else {
+      intermediate = key;
+    }
+
+    return static_cast<OutKeyT>(intermediate >> shift) &
+           mask; // shift, cast, and mask
+  }
+
+private:
+  uint8_t shift;
+  OutKeyT mask;
+  uint_type_t flip_sign;
+  uint_type_t flip_key;
+};
+
+// Unary operator that returns reference to its argument. Ported from
+// oneDPL: oneapi/dpl/pstl/utils.h
+struct no_op_fun {
+  template <typename Tp> Tp &&operator()(Tp &&a) const {
+    return ::std::forward<Tp>(a);
+  }
+};
+
+// Unary functor which composes a pair of functors by calling them in succession
+// on an input
+template <typename FunctorInner, typename FunctorOuter>
+struct __composition_functor {
+  __composition_functor(FunctorInner in, FunctorOuter out)
+      : _in(in), _out(out) {}
+  template <typename T> T operator()(const T &i) const {
+    return _out(_in(i));
+  }
+  FunctorInner _in;
+  FunctorOuter _out;
+};
+
+// Unary functor which maps an index of a ROI into a 2D flattened array
+template <typename OffsetT> struct __roi_2d_index_functor {
+  __roi_2d_index_functor(const OffsetT &num_cols,
+                         const ::std::size_t &row_stride)
+      : _num_cols(num_cols), _row_stride(row_stride) {}
+
+  template <typename Index> Index operator()(const Index &i) const {
+    return _row_stride * (i / _num_cols) + (i % _num_cols);
+  }
+
+  OffsetT _num_cols;
+  ::std::size_t _row_stride;
+};
+
+// Unary functor which maps and index into an interleaved array by its active
+// channel
+template <typename OffsetT> struct __interleaved_index_functor {
+  __interleaved_index_functor(const OffsetT &total_channels,
+                              const OffsetT &active_channel)
+      : _total_channels(total_channels), _active_channel(active_channel) {}
+
+  template <typename Index> Index operator()(const Index &i) const {
+    return i * _total_channels + _active_channel;
+  }
+
+  OffsetT _total_channels;
+  OffsetT _active_channel;
+};
+
+} // end namespace internal
+
+} // end namespace dpct
+
+#endif
diff --git a/dpct/dpl_extras/iterators.h b/dpct/dpl_extras/iterators.h
new file mode 100644
index 0000000000000..2e1d10986728e
--- /dev/null
+++ b/dpct/dpl_extras/iterators.h
@@ -0,0 +1,347 @@
+//==---- iterators.h ------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_ITERATORS_H__
+#define __DPCT_ITERATORS_H__
+
+#include <oneapi/dpl/iterator>
+
+#include "functional.h"
+
+namespace dpct {
+
+namespace internal {
+
+// Wrapper class returned from a dereferenced transform_iterator which was
+// created using
+//  make_transform_output_iterator(). Used to apply the supplied transform
+//  function when writing into an object of this class.
+//
+// Example:
+// int a[] = {0, 1, 2, 3, 4};
+// int* p = a;
+// auto f = [](auto v) {return v*v;};
+// auto tr_out = dpct::make_transform_output_iterator(p+1, f);
+// auto wrap = *tr_out;         // wrap is a transform_output_ref_wrapper
+// std::cout<<*(p+1)<<std::endl;  // '1'
+// wrap = 2;                    // apply function, store 2*2=4
+// std::cout<<*(p+1)<<std::endl;  // '4'
+template <typename T, typename _UnaryFunc> class transform_output_ref_wrapper {
+private:
+  T __my_reference_;
+  _UnaryFunc __my_unary_func_;
+
+public:
+  template <typename U>
+  transform_output_ref_wrapper(U &&__reference, _UnaryFunc __unary_func)
+      : __my_reference_(std::forward<U>(__reference)),
+        __my_unary_func_(__unary_func) {}
+
+  // When writing to an object of this type, apply the supplied unary function,
+  // then write to the wrapped reference
+  template <typename UnaryInputType>
+  transform_output_ref_wrapper &operator=(const UnaryInputType &e) {
+    __my_reference_ = __my_unary_func_(e);
+    return *this;
+  }
+};
+
+// Unary functor to create a transform_output_reference_wrapper when a
+// transform_iterator is dereferenced, so that a
+// the supplied unary function may be applied on write, resulting in a
+// transform_output_iterator
+template <typename _UnaryFunc> struct _Unary_Out {
+  _Unary_Out(_UnaryFunc __f_) : __f(__f_) {}
+  _UnaryFunc __f;
+  template <typename T> auto operator()(T &&val) const {
+    return transform_output_ref_wrapper<T, _UnaryFunc>(std::forward<T>(val),
+                                                       __f);
+  }
+};
+
+} // end namespace internal
+
+using std::advance;
+
+using std::distance;
+
+template <typename T>
+oneapi::dpl::counting_iterator<T> make_counting_iterator(const T &input) {
+  return oneapi::dpl::counting_iterator<T>(input);
+}
+
+template <typename _Tp> class constant_iterator {
+public:
+  typedef std::false_type is_hetero;
+  typedef std::true_type is_passed_directly;
+  typedef std::ptrdiff_t difference_type;
+  typedef _Tp value_type;
+  typedef _Tp *pointer;
+  // There is no storage behind the iterator, so we return a value instead of
+  // reference.
+  typedef const _Tp reference;
+  typedef const _Tp const_reference;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  explicit constant_iterator(_Tp __init)
+      : __my_value_(__init), __my_counter_(0) {}
+
+private:
+  // used to construct iterator instances with different counter values required
+  // by arithmetic operators
+  constant_iterator(const _Tp &__value, const difference_type &__offset)
+      : __my_value_(__value), __my_counter_(__offset) {}
+
+public:
+  // non-const variants of access operators are not provided so unintended
+  // writes are caught at compile time.
+  const_reference operator*() const { return __my_value_; }
+  const_reference operator[](difference_type) const { return __my_value_; }
+
+  difference_type operator-(const constant_iterator &__it) const {
+    return __my_counter_ - __it.__my_counter_;
+  }
+
+  constant_iterator &operator+=(difference_type __forward) {
+    __my_counter_ += __forward;
+    return *this;
+  }
+  constant_iterator &operator-=(difference_type __backward) {
+    return *this += -__backward;
+  }
+  constant_iterator &operator++() { return *this += 1; }
+  constant_iterator &operator--() { return *this -= 1; }
+
+  constant_iterator operator++(int) {
+    constant_iterator __it(*this);
+    ++(*this);
+    return __it;
+  }
+  constant_iterator operator--(int) {
+    constant_iterator __it(*this);
+    --(*this);
+    return __it;
+  }
+
+  constant_iterator operator-(difference_type __backward) const {
+    return constant_iterator(__my_value_, __my_counter_ - __backward);
+  }
+  constant_iterator operator+(difference_type __forward) const {
+    return constant_iterator(__my_value_, __my_counter_ + __forward);
+  }
+  friend constant_iterator operator+(difference_type __forward,
+                                     const constant_iterator __it) {
+    return __it + __forward;
+  }
+
+  bool operator==(const constant_iterator &__it) const {
+    return __my_value_ == __it.__my_value_ &&
+           this->__my_counter_ == __it.__my_counter_;
+  }
+  bool operator!=(const constant_iterator &__it) const {
+    return !(*this == __it);
+  }
+  bool operator<(const constant_iterator &__it) const {
+    return *this - __it < 0;
+  }
+  bool operator>(const constant_iterator &__it) const { return __it < *this; }
+  bool operator<=(const constant_iterator &__it) const {
+    return !(*this > __it);
+  }
+  bool operator>=(const constant_iterator &__it) const {
+    return !(*this < __it);
+  }
+
+private:
+  _Tp __my_value_;
+  uint64_t __my_counter_;
+};
+
+template <typename _Tp>
+constant_iterator<_Tp> make_constant_iterator(_Tp __value) {
+  return constant_iterator<_Tp>(__value);
+}
+
+// key_value_pair class to represent a key and value, specifically a
+// dereferenced arg_index_input_iterator
+template <typename _KeyTp, typename _ValueTp> class key_value_pair {
+public:
+  key_value_pair() = default;
+
+  key_value_pair(const _KeyTp &_key, const _ValueTp &_value)
+      : key(_key), value(_value) {}
+
+  bool operator==(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const {
+    return (key == _kvp.key) && (value == _kvp.value);
+  }
+
+  bool operator!=(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const {
+    return (key != _kvp.key) || (value != _kvp.value);
+  }
+
+  _KeyTp key;
+  _ValueTp value;
+};
+
+namespace detail {
+
+template <typename KeyTp, typename _ValueTp> struct make_key_value_pair {
+  template <typename ValRefTp>
+  key_value_pair<KeyTp, _ValueTp>
+  operator()(const oneapi::dpl::__internal::tuple<KeyTp, ValRefTp> &tup) const {
+    return ::dpct::key_value_pair<KeyTp, _ValueTp>(::std::get<0>(tup),
+                                                   ::std::get<1>(tup));
+  }
+};
+
+template <class T> struct __zip_iterator_impl;
+template <class... Ts> struct __zip_iterator_impl<std::tuple<Ts...>> {
+  using type = oneapi::dpl::zip_iterator<Ts...>;
+};
+
+} // end namespace detail
+
+// dpct::zip_iterator can only accept std::tuple type as template argument for
+// compatibility purpose. Please use oneapi::dpl::zip_iterator if you want to
+// pass iterator's types directly.
+template <typename... Ts>
+using zip_iterator = typename detail::__zip_iterator_impl<Ts...>::type;
+
+// arg_index_input_iterator is an iterator over a input iterator, with a index.
+// When dereferenced, it returns a key_value_pair, which can be interrogated for
+// the index key or the value from the input iterator
+template <typename InputIteratorT, typename OffsetT = ptrdiff_t,
+          typename OutputValueT =
+              typename ::std::iterator_traits<InputIteratorT>::value_type>
+class arg_index_input_iterator
+    : public oneapi::dpl::transform_iterator<
+          oneapi::dpl::zip_iterator<oneapi::dpl::counting_iterator<OffsetT>,
+                                    InputIteratorT>,
+          detail::make_key_value_pair<OffsetT, OutputValueT>> {
+  using arg_index_input_iterator_wrap = oneapi::dpl::transform_iterator<
+      oneapi::dpl::zip_iterator<oneapi::dpl::counting_iterator<OffsetT>,
+                                InputIteratorT>,
+      detail::make_key_value_pair<OffsetT, OutputValueT>>;
+
+public:
+  typedef OffsetT difference_type;
+
+  // signal to __get_sycl_range that this iterator is as a direct pass iterator
+  using is_zip = ::std::true_type;
+
+  arg_index_input_iterator(const arg_index_input_iterator_wrap &__arg_wrap)
+      : arg_index_input_iterator_wrap(__arg_wrap) {}
+  arg_index_input_iterator(InputIteratorT __iter)
+      : arg_index_input_iterator_wrap(
+            oneapi::dpl::make_zip_iterator(
+                oneapi::dpl::counting_iterator(OffsetT(0)), __iter),
+            detail::make_key_value_pair<OffsetT, OutputValueT>()) {}
+
+  arg_index_input_iterator &operator=(const arg_index_input_iterator &__input) {
+    arg_index_input_iterator_wrap::operator=(__input);
+    return *this;
+  }
+  arg_index_input_iterator &operator++() {
+    arg_index_input_iterator_wrap::operator++();
+    return *this;
+  }
+  arg_index_input_iterator &operator--() {
+    arg_index_input_iterator_wrap::operator--();
+    return *this;
+  }
+  arg_index_input_iterator operator++(int) {
+    arg_index_input_iterator __it(*this);
+    ++(*this);
+    return __it;
+  }
+  arg_index_input_iterator operator--(int) {
+    arg_index_input_iterator __it(*this);
+    --(*this);
+    return __it;
+  }
+  arg_index_input_iterator operator+(difference_type __forward) const {
+    return arg_index_input_iterator(
+        arg_index_input_iterator_wrap::operator+(__forward));
+  }
+  arg_index_input_iterator operator-(difference_type __backward) const {
+    return arg_index_input_iterator(
+        arg_index_input_iterator_wrap::operator-(__backward));
+  }
+  arg_index_input_iterator &operator+=(difference_type __forward) {
+    arg_index_input_iterator_wrap::operator+=(__forward);
+    return *this;
+  }
+  arg_index_input_iterator &operator-=(difference_type __backward) {
+    arg_index_input_iterator_wrap::operator-=(__backward);
+    return *this;
+  }
+
+  friend arg_index_input_iterator
+  operator+(difference_type __forward, const arg_index_input_iterator &__it) {
+    return __it + __forward;
+  }
+
+  difference_type operator-(const arg_index_input_iterator &__it) const {
+    return arg_index_input_iterator_wrap::operator-(__it);
+  }
+  bool operator==(const arg_index_input_iterator &__it) const {
+    return arg_index_input_iterator_wrap::operator==(__it);
+  }
+  bool operator!=(const arg_index_input_iterator &__it) const {
+    return !(*this == __it);
+  }
+  bool operator<(const arg_index_input_iterator &__it) const {
+    return *this - __it < 0;
+  }
+  bool operator>(const arg_index_input_iterator &__it) const {
+    return __it < *this;
+  }
+  bool operator<=(const arg_index_input_iterator &__it) const {
+    return !(*this > __it);
+  }
+  bool operator>=(const arg_index_input_iterator &__it) const {
+    return !(*this < __it);
+  }
+
+  // returns an arg_index_input_iterator with the same iter position, but a
+  // count reset to 0
+  arg_index_input_iterator create_normalized() {
+    return arg_index_input_iterator(
+        ::std::get<1>(arg_index_input_iterator_wrap::base().base()));
+  }
+};
+
+template <typename IterT> struct io_iterator_pair {
+  inline io_iterator_pair() : selector(false) {}
+
+  inline io_iterator_pair(const IterT &first, const IterT &second)
+      : selector(false) {
+    iter[0] = first;
+    iter[1] = second;
+  }
+
+  inline IterT first() const { return selector ? iter[1] : iter[0]; }
+
+  inline IterT second() const { return selector ? iter[0] : iter[1]; }
+
+  inline void swap() { selector = !selector; }
+
+  bool selector;
+
+  IterT iter[2];
+};
+
+template <typename _Iter, typename _UnaryFunc>
+auto make_transform_output_iterator(_Iter __it, _UnaryFunc __unary_func) {
+  return oneapi::dpl::transform_iterator(
+      __it, internal::_Unary_Out<_UnaryFunc>(__unary_func));
+}
+
+} // end namespace dpct
+
+#endif
diff --git a/dpct/dpl_extras/memory.h b/dpct/dpl_extras/memory.h
new file mode 100644
index 0000000000000..08b965133f519
--- /dev/null
+++ b/dpct/dpl_extras/memory.h
@@ -0,0 +1,1024 @@
+//==---- memory.h ---------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_MEMORY_H__
+#define __DPCT_MEMORY_H__
+
+#include <sycl/sycl.hpp>
+#include <oneapi/dpl/memory>
+#include "functional.h"
+
+// Memory management section:
+// device_pointer, device_reference, swap, device_iterator, malloc_device,
+// device_new, free_device, device_delete
+namespace dpct {
+namespace detail {
+template <typename T>
+struct make_allocatable
+{
+  using type = T;
+};
+
+template <>
+struct make_allocatable<void>
+{
+  using type = dpct::byte_t;
+};
+
+#if defined(__LIBSYCL_MAJOR_VERSION) && defined(__LIBSYCL_MINOR_VERSION) &&    \
+    defined(__LIBSYCL_PATCH_VERSION)
+#define _DPCT_LIBSYCL_VERSION                                                  \
+  (__LIBSYCL_MAJOR_VERSION * 10000 + __LIBSYCL_MINOR_VERSION * 100 +           \
+   __LIBSYCL_PATCH_VERSION)
+#else
+#define _DPCT_LIBSYCL_VERSION 0
+#endif
+
+template <typename _DataT>
+using __buffer_allocator =
+#if _DPCT_LIBSYCL_VERSION >= 60000
+    sycl::buffer_allocator<typename make_allocatable<_DataT>::type>;
+#else
+    sycl::buffer_allocator;
+#endif
+} // namespace detail
+
+#ifdef DPCT_USM_LEVEL_NONE
+template <typename T, sycl::access_mode Mode = sycl::access_mode::read_write,
+          typename Allocator = detail::__buffer_allocator<T>>
+class device_pointer;
+#else
+template <typename T> class device_pointer;
+#endif
+
+template <typename T> struct device_reference {
+  using pointer = device_pointer<T>;
+  using value_type = T;
+  template <typename OtherT>
+  device_reference(const device_reference<OtherT> &input)
+      : value(input.value) {}
+  device_reference(const pointer &input) : value((*input).value) {}
+  device_reference(value_type &input) : value(input) {}
+  template <typename OtherT>
+  device_reference &operator=(const device_reference<OtherT> &input) {
+    value = input;
+    return *this;
+  };
+  device_reference &operator=(const device_reference &input) {
+    T val = input.value;
+    value = val;
+    return *this;
+  };
+  device_reference &operator=(const value_type &x) {
+    value = x;
+    return *this;
+  };
+  pointer operator&() const { return pointer(&value); };
+  operator value_type() const { return T(value); }
+  device_reference &operator++() {
+    ++value;
+    return *this;
+  };
+  device_reference &operator--() {
+    --value;
+    return *this;
+  };
+  device_reference operator++(int) {
+    device_reference ref(*this);
+    ++(*this);
+    return ref;
+  };
+  device_reference operator--(int) {
+    device_reference ref(*this);
+    --(*this);
+    return ref;
+  };
+  device_reference &operator+=(const T &input) {
+    value += input;
+    return *this;
+  };
+  device_reference &operator-=(const T &input) {
+    value -= input;
+    return *this;
+  };
+  device_reference &operator*=(const T &input) {
+    value *= input;
+    return *this;
+  };
+  device_reference &operator/=(const T &input) {
+    value /= input;
+    return *this;
+  };
+  device_reference &operator%=(const T &input) {
+    value %= input;
+    return *this;
+  };
+  device_reference &operator&=(const T &input) {
+    value &= input;
+    return *this;
+  };
+  device_reference &operator|=(const T &input) {
+    value |= input;
+    return *this;
+  };
+  device_reference &operator^=(const T &input) {
+    value ^= input;
+    return *this;
+  };
+  device_reference &operator<<=(const T &input) {
+    value <<= input;
+    return *this;
+  };
+  device_reference &operator>>=(const T &input) {
+    value >>= input;
+    return *this;
+  };
+  void swap(device_reference &input) {
+    T tmp = (*this);
+    *this = (input);
+    input = (tmp);
+  }
+  T &value;
+};
+
+template <typename T>
+void swap(device_reference<T> &x, device_reference<T> &y) {
+  x.swap(y);
+}
+
+template <typename T> void swap(T &x, T &y) {
+  T tmp = x;
+  x = y;
+  y = tmp;
+}
+
+template <typename T>
+::std::ostream &operator<<(::std::ostream &out,
+                           const device_reference<T> &ref) {
+  return out << T(ref);
+}
+
+namespace internal {
+// struct for checking if iterator is heterogeneous or not
+template <typename Iter,
+          typename Void = void> // for non-heterogeneous iterators
+struct is_hetero_iterator : std::false_type {};
+
+template <typename Iter> // for heterogeneous iterators
+struct is_hetero_iterator<
+    Iter, typename std::enable_if<Iter::is_hetero::value, void>::type>
+    : std::true_type {};
+} // namespace internal
+
+#ifdef DPCT_USM_LEVEL_NONE
+// Must be forward declared due to default argument
+template <typename T>
+device_pointer<T> device_new(device_pointer<void>, const T &,
+                             const std::size_t = 1);
+
+template <typename T, sycl::access_mode Mode, typename Allocator>
+class device_iterator;
+
+template <typename ValueType, typename Allocator, typename Derived>
+class device_pointer_base {
+protected:
+  sycl::buffer<ValueType, 1, Allocator> buffer;
+  std::size_t idx;
+
+  // Declare friend to give access to protected buffer and idx members
+  template <typename T>
+  friend device_pointer<T> device_new(device_pointer<void>, const T &,
+                                      const std::size_t);
+
+public:
+  using pointer = ValueType *;
+  using difference_type = std::make_signed<std::size_t>::type;
+
+  device_pointer_base(sycl::buffer<ValueType, 1> in, std::size_t i = 0)
+      : buffer(in), idx(i) {}
+#ifdef __USE_DPCT
+  template <typename OtherT>
+  device_pointer_base(OtherT *ptr)
+      : buffer(
+            dpct::detail::mem_mgr::instance()
+                .translate_ptr(ptr)
+                .buffer.template reinterpret<ValueType, 1>(sycl::range<1>(
+                    dpct::detail::mem_mgr::instance().translate_ptr(ptr).size /
+                    sizeof(ValueType)))),
+        idx(ptr - (ValueType*)dpct::detail::mem_mgr::instance()
+                .translate_ptr(ptr).alloc_ptr) {}
+#endif
+  device_pointer_base(const std::size_t count)
+      : buffer(sycl::range<1>(count / sizeof(ValueType))), idx() {}
+  // buffer has no default ctor we pass zero-range to create an empty buffer
+  device_pointer_base() : buffer(sycl::range<1>(0)) {}
+  device_pointer_base(const device_pointer_base &in)
+      : buffer(in.buffer), idx(in.idx) {}
+  pointer get() const {
+    auto res =
+        (const_cast<device_pointer_base *>(this)
+             ->buffer.template get_access<sycl::access_mode::read_write>())
+            .get_pointer();
+    return res + idx;
+  }
+  operator ValueType *() {
+    auto res = (buffer.template get_access<sycl::access_mode::read_write>())
+                   .get_pointer();
+    return res + idx;
+  }
+  operator ValueType *() const {
+    auto res =
+        (const_cast<device_pointer_base *>(this)
+             ->buffer.template get_access<sycl::access_mode::read_write>())
+            .get_pointer();
+    return res + idx;
+  }
+  Derived operator+(difference_type forward) const {
+    return Derived{buffer, idx + forward};
+  }
+  Derived operator-(difference_type backward) const {
+    return Derived{buffer, idx - backward};
+  }
+  Derived operator++(int) {
+    Derived p(buffer, idx);
+    idx += 1;
+    return p;
+  }
+  Derived operator--(int) {
+    Derived p(buffer, idx);
+    idx -= 1;
+    return p;
+  }
+  difference_type operator-(const Derived &it) const { return idx - it.idx; }
+  template <typename OtherIterator>
+  typename std::enable_if<internal::is_hetero_iterator<OtherIterator>::value,
+                          difference_type>::type
+  operator-(const OtherIterator &it) const {
+    return idx - std::distance(oneapi::dpl::begin(buffer), it);
+  }
+
+  std::size_t get_idx() const { return idx; } // required
+
+  sycl::buffer<ValueType, 1, Allocator> get_buffer() {
+    return buffer;
+  } // required
+};
+
+template <sycl::access_mode Mode, typename Allocator>
+class device_pointer<void, Mode, Allocator>
+    : public device_pointer_base<dpct::byte_t, Allocator,
+                                 device_pointer<void, Mode, Allocator>> {
+private:
+  using base_type =
+      device_pointer_base<dpct::byte_t, Allocator, device_pointer>;
+
+public:
+  using value_type = dpct::byte_t;
+  using difference_type = std::make_signed<std::size_t>::type;
+  using pointer = void *;
+  using reference = value_type &;
+  using iterator_category = std::random_access_iterator_tag;
+  using is_hetero = std::true_type; // required
+  using is_passed_directly = std::false_type;
+  static constexpr sycl::access_mode mode = Mode; // required
+
+  device_pointer(sycl::buffer<value_type, 1> in, std::size_t i = 0)
+      : base_type(in, i) {}
+#ifdef __USE_DPCT
+  template <typename OtherT> device_pointer(OtherT *ptr) : base_type(ptr) {}
+#endif
+  // needed for malloc_device, count is number of bytes to allocate
+  device_pointer(const std::size_t count) : base_type(count) {}
+  device_pointer() : base_type() {}
+  device_pointer(const device_pointer &in) : base_type(in) {}
+  device_pointer &operator+=(difference_type forward) {
+    this->idx += forward;
+    return *this;
+  }
+  device_pointer &operator-=(difference_type backward) {
+    this->idx -= backward;
+    return *this;
+  }
+  // include operators from base class
+  using base_type::operator++;
+  using base_type::operator--;
+  device_pointer &operator++() {
+    this->idx += 1;
+    return *this;
+  }
+  device_pointer &operator--() {
+    this->idx -= 1;
+    return *this;
+  }
+};
+
+template <typename T, sycl::access_mode Mode, typename Allocator>
+class device_pointer
+    : public device_pointer_base<T, Allocator,
+                                 device_pointer<T, Mode, Allocator>> {
+private:
+  using base_type = device_pointer_base<T, Allocator, device_pointer>;
+
+public:
+  using value_type = T;
+  using difference_type = std::make_signed<std::size_t>::type;
+  using pointer = T *;
+  using reference = T &;
+  using iterator_category = std::random_access_iterator_tag;
+  using is_hetero = std::true_type; // required
+  using is_passed_directly = std::false_type;
+  static constexpr sycl::access_mode mode = Mode; // required
+
+  device_pointer(sycl::buffer<T, 1> in, std::size_t i = 0) : base_type(in, i) {}
+#ifdef __USE_DPCT
+  template <typename OtherT> device_pointer(OtherT *ptr) : base_type(ptr) {}
+#endif
+  // needed for malloc_device, count is number of bytes to allocate
+  device_pointer(const std::size_t count) : base_type(count) {}
+  device_pointer() : base_type() {}
+  device_pointer(const device_pointer &in) : base_type(in) {}
+  device_pointer &operator+=(difference_type forward) {
+    this->idx += forward;
+    return *this;
+  }
+  device_pointer &operator-=(difference_type backward) {
+    this->idx -= backward;
+    return *this;
+  }
+  operator device_pointer<void>() {
+    auto converted_buf = (this->buffer)
+                             .template reinterpret<dpct::byte_t>(sycl::range<1>(
+                                 sizeof(value_type) * this->buffer.size()));
+    return device_pointer<void>(converted_buf, this->idx);
+  }
+  // include operators from base class
+  using base_type::operator++;
+  using base_type::operator--;
+  device_pointer &operator++() {
+    this->idx += 1;
+    return *this;
+  }
+  device_pointer &operator--() {
+    this->idx -= 1;
+    return *this;
+  }
+};
+#else
+template <typename T> class device_iterator;
+
+template <typename ValueType, typename Derived> class device_pointer_base {
+protected:
+  ValueType *ptr;
+
+public:
+  using pointer = ValueType *;
+  using difference_type = std::make_signed<std::size_t>::type;
+
+  device_pointer_base(ValueType *p) : ptr(p) {}
+  device_pointer_base(const std::size_t count) {
+    sycl::queue default_queue = dpct::get_default_queue();
+    ptr = static_cast<ValueType *>(sycl::malloc_shared(
+        count, default_queue.get_device(), default_queue.get_context()));
+  }
+  device_pointer_base() {}
+  pointer get() const { return ptr; }
+  operator ValueType *() { return ptr; }
+  operator ValueType *() const { return ptr; }
+
+  ValueType &operator[](difference_type idx) { return ptr[idx]; }
+  ValueType &operator[](difference_type idx) const { return ptr[idx]; }
+
+  Derived operator+(difference_type forward) const {
+    return Derived{ptr + forward};
+  }
+  Derived operator-(difference_type backward) const {
+    return Derived{ptr - backward};
+  }
+  Derived operator++(int) {
+    Derived p(ptr);
+    ++ptr;
+    return p;
+  }
+  Derived operator--(int) {
+    Derived p(ptr);
+    --ptr;
+    return p;
+  }
+  difference_type operator-(const Derived &it) const { return ptr - it.ptr; }
+};
+
+template <>
+class device_pointer<void>
+    : public device_pointer_base<dpct::byte_t, device_pointer<void>> {
+private:
+  using base_type = device_pointer_base<dpct::byte_t, device_pointer<void>>;
+
+public:
+  using value_type = dpct::byte_t;
+  using difference_type = std::make_signed<std::size_t>::type;
+  using pointer = void *;
+  using reference = value_type &;
+  using const_reference = const value_type &;
+  using iterator_category = std::random_access_iterator_tag;
+  using is_hetero = std::false_type;         // required
+  using is_passed_directly = std::true_type; // required
+
+  device_pointer(void *p) : base_type(static_cast<value_type *>(p)) {}
+  // needed for malloc_device, count is number of bytes to allocate
+  device_pointer(const std::size_t count) : base_type(count) {}
+  device_pointer() : base_type() {}
+  pointer get() const { return static_cast<pointer>(this->ptr); }
+  operator void *() { return this->ptr; }
+  operator void *() const { return this->ptr; }
+
+  // include operators from base class
+  using base_type::operator++;
+  using base_type::operator--;
+  device_pointer &operator++() {
+    ++(this->ptr);
+    return *this;
+  }
+  device_pointer &operator--() {
+    --(this->ptr);
+    return *this;
+  }
+  device_pointer &operator+=(difference_type forward) {
+    this->ptr = this->ptr + forward;
+    return *this;
+  }
+  device_pointer &operator-=(difference_type backward) {
+    this->ptr = this->ptr - backward;
+    return *this;
+  }
+};
+
+template <typename T>
+class device_pointer : public device_pointer_base<T, device_pointer<T>> {
+private:
+  using base_type = device_pointer_base<T, device_pointer<T>>;
+
+public:
+  using value_type = T;
+  using difference_type = std::make_signed<std::size_t>::type;
+  using pointer = T *;
+  using reference = T &;
+  using const_reference = const T &;
+  using iterator_category = std::random_access_iterator_tag;
+  using is_hetero = std::false_type;         // required
+  using is_passed_directly = std::true_type; // required
+
+  device_pointer(T *p) : base_type(p) {}
+  // needed for malloc_device, count is number of bytes to allocate
+  device_pointer(const std::size_t count) : base_type(count) {}
+  device_pointer() : base_type() {}
+  device_pointer &operator=(const device_iterator<T> &in) {
+    this->ptr = static_cast<device_pointer<T>>(in).ptr;
+    return *this;
+  }
+  operator device_pointer<void>() {
+    return device_pointer<void>(static_cast<void *>(this->ptr));
+  }
+  // include operators from base class
+  using base_type::operator++;
+  using base_type::operator--;
+  device_pointer &operator++() {
+    ++(this->ptr);
+    return *this;
+  }
+  device_pointer &operator--() {
+    --(this->ptr);
+    return *this;
+  }
+  device_pointer &operator+=(difference_type forward) {
+    this->ptr = this->ptr + forward;
+    return *this;
+  }
+  device_pointer &operator-=(difference_type backward) {
+    this->ptr = this->ptr - backward;
+    return *this;
+  }
+};
+#endif
+
+#ifdef DPCT_USM_LEVEL_NONE
+template <typename T, sycl::access_mode Mode = sycl::access_mode::read_write,
+          typename Allocator = detail::__buffer_allocator<T>>
+class device_iterator : public device_pointer<T, Mode, Allocator> {
+  using Base = device_pointer<T, Mode, Allocator>;
+
+public:
+  using value_type = T;
+  using difference_type = std::make_signed<std::size_t>::type;
+  using pointer = T *;
+  using reference = T &;
+  using iterator_category = std::random_access_iterator_tag;
+  using is_hetero = std::true_type;                // required
+  using is_passed_directly = std::false_type;      // required
+  static constexpr sycl::access_mode mode = Mode; // required
+
+  device_iterator() : Base() {}
+  device_iterator(sycl::buffer<T, 1, Allocator> vec, std::size_t index)
+      : Base(vec, index) {}
+  device_iterator(const Base &dev_ptr) : Base(dev_ptr) {}
+  template <sycl::access_mode inMode>
+  device_iterator(const device_iterator<T, inMode, Allocator> &in)
+      : Base(in.buffer, in.idx) {} // required for iter_mode
+  device_iterator &operator=(const device_iterator &in) {
+    Base::buffer = in.buffer;
+    Base::idx = in.idx;
+    return *this;
+  }
+
+  reference operator*() const {
+    return const_cast<device_iterator *>(this)
+        ->buffer.template get_access<mode>()[Base::idx];
+  }
+
+  reference operator[](difference_type i) const { return *(*this + i); }
+  device_iterator &operator++() {
+    ++Base::idx;
+    return *this;
+  }
+  device_iterator &operator--() {
+    --Base::idx;
+    return *this;
+  }
+  device_iterator operator++(int) {
+    device_iterator it(*this);
+    ++(*this);
+    return it;
+  }
+  device_iterator operator--(int) {
+    device_iterator it(*this);
+    --(*this);
+    return it;
+  }
+  device_iterator operator+(difference_type forward) const {
+    const auto new_idx = Base::idx + forward;
+    return {Base::buffer, new_idx};
+  }
+  device_iterator &operator+=(difference_type forward) {
+    Base::idx += forward;
+    return *this;
+  }
+  device_iterator operator-(difference_type backward) const {
+    return {Base::buffer, Base::idx - backward};
+  }
+  device_iterator &operator-=(difference_type backward) {
+    Base::idx -= backward;
+    return *this;
+  }
+  friend device_iterator operator+(difference_type forward,
+                                   const device_iterator &it) {
+    return it + forward;
+  }
+  difference_type operator-(const device_iterator &it) const {
+    return Base::idx - it.idx;
+  }
+  template <typename OtherIterator>
+  typename std::enable_if<internal::is_hetero_iterator<OtherIterator>::value,
+                          difference_type>::type
+  operator-(const OtherIterator &it) const {
+    return Base::idx - std::distance(oneapi::dpl::begin(Base::buffer), it);
+  }
+  bool operator==(const device_iterator &it) const { return *this - it == 0; }
+  bool operator!=(const device_iterator &it) const { return !(*this == it); }
+  bool operator<(const device_iterator &it) const { return *this - it < 0; }
+  bool operator>(const device_iterator &it) const { return it < *this; }
+  bool operator<=(const device_iterator &it) const { return !(*this > it); }
+  bool operator>=(const device_iterator &it) const { return !(*this < it); }
+
+  std::size_t get_idx() const { return Base::idx; } // required
+
+  sycl::buffer<T, 1, Allocator> get_buffer() {
+    return Base::buffer;
+  } // required
+};
+#else
+template <typename T> class device_iterator : public device_pointer<T> {
+  using Base = device_pointer<T>;
+
+protected:
+  std::size_t idx;
+
+public:
+  using value_type = T;
+  using difference_type = std::make_signed<std::size_t>::type;
+  using pointer = typename Base::pointer;
+  using reference = typename Base::reference;
+  using iterator_category = std::random_access_iterator_tag;
+  using is_hetero = std::false_type;         // required
+  using is_passed_directly = std::true_type; // required
+  static constexpr sycl::access_mode mode =
+      sycl::access_mode::read_write; // required
+
+  device_iterator() : Base(nullptr), idx(0) {}
+  device_iterator(T *vec, std::size_t index) : Base(vec), idx(index) {}
+  device_iterator(const Base &dev_ptr) : Base(dev_ptr), idx(0) {}
+  template <sycl::access_mode inMode>
+  device_iterator(const device_iterator<T> &in)
+      : Base(in.ptr), idx(in.idx) {} // required for iter_mode
+  device_iterator &operator=(const device_iterator &in) {
+    Base::operator=(in);
+    idx = in.idx;
+    return *this;
+  }
+
+  reference operator*() const { return *(Base::ptr + idx); }
+
+  reference operator[](difference_type i) { return Base::ptr[idx + i]; }
+  reference operator[](difference_type i) const { return Base::ptr[idx + i]; }
+  device_iterator &operator++() {
+    ++idx;
+    return *this;
+  }
+  device_iterator &operator--() {
+    --idx;
+    return *this;
+  }
+  device_iterator operator++(int) {
+    device_iterator it(*this);
+    ++(*this);
+    return it;
+  }
+  device_iterator operator--(int) {
+    device_iterator it(*this);
+    --(*this);
+    return it;
+  }
+  device_iterator operator+(difference_type forward) const {
+    const auto new_idx = idx + forward;
+    return {Base::ptr, new_idx};
+  }
+  device_iterator &operator+=(difference_type forward) {
+    idx += forward;
+    return *this;
+  }
+  device_iterator operator-(difference_type backward) const {
+    return {Base::ptr, idx - backward};
+  }
+  device_iterator &operator-=(difference_type backward) {
+    idx -= backward;
+    return *this;
+  }
+  friend device_iterator operator+(difference_type forward,
+                                   const device_iterator &it) {
+    return it + forward;
+  }
+  difference_type operator-(const device_iterator &it) const {
+    return idx - it.idx;
+  }
+
+  template <typename OtherIterator>
+  typename std::enable_if<internal::is_hetero_iterator<OtherIterator>::value,
+                          difference_type>::type
+  operator-(const OtherIterator &it) const {
+    return idx - it.get_idx();
+  }
+
+  bool operator==(const device_iterator &it) const { return *this - it == 0; }
+  bool operator!=(const device_iterator &it) const { return !(*this == it); }
+  bool operator<(const device_iterator &it) const { return *this - it < 0; }
+  bool operator>(const device_iterator &it) const { return it < *this; }
+  bool operator<=(const device_iterator &it) const { return !(*this > it); }
+  bool operator>=(const device_iterator &it) const { return !(*this < it); }
+
+  std::size_t get_idx() const { return idx; } // required
+
+  device_iterator &get_buffer() { return *this; } // required
+
+  std::size_t size() const { return idx; }
+};
+#endif
+
+struct sys_tag {};
+struct device_sys_tag : public sys_tag {};
+struct host_sys_tag : public sys_tag {};
+
+#ifdef DPCT_USM_LEVEL_NONE
+template <typename T, typename Tag> class tagged_pointer {
+  static_assert(false,
+                "tagged_pointer is not supported with DPCT_USM_LEVEL_NONE");
+};
+template <typename PolicyOrTag, typename Pointer>
+void release_temporary_allocation(PolicyOrTag &&policy_or_tag, Pointer ptr) {
+  static_assert(
+      false,
+      "release_temporary_allocation is not supported with DPCT_USM_LEVEL_NONE");
+}
+template <typename T, typename PolicyOrTag, typename SizeType>
+auto get_temporary_allocation(PolicyOrTag &&policy_or_tag,
+                              SizeType num_elements) {
+  static_assert(
+      false,
+      "get_temporary_allocation is not supported with DPCT_USM_LEVEL_NONE");
+}
+template <typename PolicyOrTag>
+auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_bytes) {
+  static_assert(false, "malloc is not supported with DPCT_USM_LEVEL_NONE");
+}
+template <typename T, typename PolicyOrTag>
+auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_elements) {
+  static_assert(false, "malloc<T> is not supported with DPCT_USM_LEVEL_NONE");
+}
+template <typename PolicyOrTag, typename Pointer>
+void free(PolicyOrTag &&policy_or_tag, Pointer ptr) {
+  static_assert(false, "free is not supported with DPCT_USM_LEVEL_NONE");
+}
+#else
+namespace internal {
+
+// Utility that converts a policy to a tag or reflects a provided tag
+template <typename PolicyOrTag> struct policy_or_tag_to_tag {
+private:
+  using decayed_policy_or_tag_t = ::std::decay_t<PolicyOrTag>;
+  using policy_conversion = ::std::conditional_t<
+      !is_hetero_execution_policy<decayed_policy_or_tag_t>::value, host_sys_tag,
+      device_sys_tag>;
+  static constexpr bool is_policy_v =
+      oneapi::dpl::execution::is_execution_policy_v<decayed_policy_or_tag_t>;
+  static constexpr bool is_sys_tag_v = ::std::disjunction_v<
+      ::std::is_same<decayed_policy_or_tag_t, host_sys_tag>,
+      ::std::is_same<decayed_policy_or_tag_t, device_sys_tag>>;
+  static_assert(is_policy_v || is_sys_tag_v,
+                "Only oneDPL policies or system tags may be provided");
+
+public:
+  using type = ::std::conditional_t<is_policy_v, policy_conversion,
+                                    decayed_policy_or_tag_t>;
+};
+
+template <typename PolicyOrTag>
+using policy_or_tag_to_tag_t = typename policy_or_tag_to_tag<PolicyOrTag>::type;
+
+template <typename PolicyOrTag> struct is_host_policy_or_tag {
+private:
+  using tag_t = policy_or_tag_to_tag_t<PolicyOrTag>;
+
+public:
+  static constexpr bool value = ::std::is_same_v<tag_t, host_sys_tag>;
+};
+
+template <typename PolicyOrTag>
+inline constexpr bool is_host_policy_or_tag_v =
+    is_host_policy_or_tag<PolicyOrTag>::value;
+
+} // namespace internal
+
+// TODO: Make this class an iterator adaptor.
+// tagged_pointer provides a wrapper around a raw pointer type with a tag of the
+// location of the allocated memory. Standard pointer operations are supported
+// with this class.
+template <typename T, typename Tag> class tagged_pointer {
+public:
+  using value_type = T;
+  using difference_type = ::std::ptrdiff_t;
+  using pointer = T *;
+  using reference = T &;
+  using iterator_category = std::random_access_iterator_tag;
+  using is_hetero = ::std::false_type;
+  using is_passed_directly = std::true_type;
+
+  tagged_pointer() : m_ptr(nullptr) {}
+  tagged_pointer(T *ptr) : m_ptr(ptr) {}
+  T &operator[](difference_type idx) { return this->m_ptr[idx]; }
+  const T &operator[](difference_type idx) const { return this->m_ptr[idx]; }
+  tagged_pointer operator+(difference_type forward) const {
+    return tagged_pointer{this->m_ptr + forward};
+  }
+  tagged_pointer operator-(difference_type backward) const {
+    return tagged_pointer{this->m_ptr - backward};
+  }
+  operator const T *() const { return m_ptr; }
+  operator T *() { return m_ptr; }
+  T &operator*() { return *this->m_ptr; }
+  const T &operator*() const { return *this->m_ptr; }
+  T *operator->() { return this->m_ptr; }
+  const T *operator->() const { return this->m_ptr; }
+  tagged_pointer operator++(int) {
+    tagged_pointer p(this->m_ptr);
+    ++this->m_ptr;
+    return p;
+  }
+  tagged_pointer operator--(int) {
+    tagged_pointer p(this->m_ptr);
+    --this->m_ptr;
+    return p;
+  }
+  tagged_pointer &operator++() {
+    ++this->m_ptr;
+    return *this;
+  }
+  tagged_pointer &operator--() {
+    --this->m_ptr;
+    return *this;
+  }
+  difference_type operator-(const tagged_pointer &it) const {
+    return this->m_ptr - it.m_ptr;
+  }
+  tagged_pointer &operator+=(difference_type forward) {
+    this->m_ptr = this->m_ptr + forward;
+    return *this;
+  }
+  tagged_pointer &operator-=(difference_type backward) {
+    this->m_ptr = this->m_ptr - backward;
+    return *this;
+  }
+
+private:
+  T *m_ptr;
+};
+
+// Void specialization for tagged pointers. Iterator traits are not provided but
+// conversion to other non-void tagged pointers is allowed. Pointer arithmetic
+// is disallowed with this specialization.
+template <typename Tag> class tagged_pointer<void, Tag> {
+public:
+  using difference_type = ::std::ptrdiff_t;
+  using pointer = void *;
+  tagged_pointer() : m_ptr(nullptr) {}
+  tagged_pointer(pointer ptr) : m_ptr(ptr) {}
+  operator const void *() const { return m_ptr; }
+  operator void *() { return m_ptr; }
+  // Enable tagged void pointer to convert to all other raw pointer types.
+  template <typename OtherPtr> operator OtherPtr *() const {
+    return static_cast<OtherPtr *>(this->m_ptr);
+  }
+
+private:
+  void *m_ptr;
+};
+
+namespace internal {
+
+// Internal utility to return raw pointer to allocated memory. Note that host
+// allocations are not device accessible (not pinned).
+template <typename PolicyOrTag>
+void *malloc_base(PolicyOrTag &&policy_or_tag, const ::std::size_t num_bytes) {
+  using decayed_policy_or_tag_t = ::std::decay_t<PolicyOrTag>;
+  if constexpr (internal::is_host_policy_or_tag_v<PolicyOrTag>) {
+    return ::std::malloc(num_bytes);
+  } else {
+    sycl::queue q;
+    // Grab the associated queue if a device policy is provided. Otherwise, use
+    // default constructed.
+    if constexpr (oneapi::dpl::execution::is_execution_policy_v<
+                      decayed_policy_or_tag_t>) {
+      q = policy_or_tag.queue();
+    } else {
+      q = get_default_queue();
+    }
+    return sycl::malloc_shared(num_bytes, q);
+  }
+}
+
+} // namespace internal
+
+template <typename PolicyOrTag>
+auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_bytes) {
+  return tagged_pointer<void, internal::policy_or_tag_to_tag_t<PolicyOrTag>>(
+      internal::malloc_base(::std::forward<PolicyOrTag>(policy_or_tag),
+                            num_bytes));
+}
+
+template <typename T, typename PolicyOrTag>
+auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_elements) {
+  return tagged_pointer<T, internal::policy_or_tag_to_tag_t<PolicyOrTag>>(
+      static_cast<T *>(
+          internal::malloc_base(::std::forward<PolicyOrTag>(policy_or_tag),
+                                num_elements * sizeof(T))));
+}
+
+template <typename PolicyOrTag, typename Pointer>
+void free(PolicyOrTag &&policy_or_tag, Pointer ptr) {
+  using decayed_policy_or_tag_t = ::std::decay_t<PolicyOrTag>;
+  if constexpr (internal::is_host_policy_or_tag_v<PolicyOrTag>) {
+    ::std::free(ptr);
+  } else {
+    sycl::queue q;
+    // Grab the associated queue if a device policy is provided. Otherwise, use
+    // default constructed.
+    if constexpr (oneapi::dpl::execution::is_execution_policy_v<
+                      decayed_policy_or_tag_t>) {
+      q = policy_or_tag.queue();
+    } else {
+      q = get_default_queue();
+    }
+    sycl::free(ptr, q);
+  }
+}
+
+template <typename T, typename PolicyOrTag, typename SizeType>
+auto get_temporary_allocation(PolicyOrTag &&policy_or_tag,
+                              SizeType num_elements) {
+  auto allocation_ptr =
+      dpct::malloc<T>(::std::forward<PolicyOrTag>(policy_or_tag), num_elements);
+  if (allocation_ptr == nullptr)
+    return ::std::make_pair(allocation_ptr, SizeType(0));
+  return ::std::make_pair(allocation_ptr, num_elements);
+}
+
+template <typename PolicyOrTag, typename Pointer>
+void release_temporary_allocation(PolicyOrTag &&policy_or_tag, Pointer ptr) {
+  dpct::free(::std::forward<PolicyOrTag>(policy_or_tag), ptr);
+}
+#endif
+
+template <typename T>
+device_pointer<T> malloc_device(const std::size_t num_elements) {
+  return device_pointer<T>(num_elements * sizeof(T));
+}
+static inline device_pointer<void> malloc_device(const std::size_t num_bytes) {
+  return device_pointer<void>(num_bytes);
+}
+#ifdef DPCT_USM_LEVEL_NONE
+template <typename T>
+device_pointer<T> device_new(device_pointer<void> p, const T &value,
+                             const std::size_t count) {
+  auto converted_buf = p.buffer.template reinterpret<T>(sycl::range<1>(count));
+  ::std::uninitialized_fill(
+      oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
+      oneapi::dpl::begin(converted_buf),
+      oneapi::dpl::end(converted_buf), value);
+  return device_pointer<T>(converted_buf, p.idx);
+}
+// buffer manages lifetime
+template <typename T> void free_device(device_pointer<T> ptr) {}
+#else
+template <typename T>
+device_pointer<T> device_new(device_pointer<void> p, const T &value,
+                             const std::size_t count = 1) {
+  dpct::device_pointer<T> converted_p(static_cast<T *>(p.get()));
+  ::std::uninitialized_fill(
+      oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
+      converted_p, converted_p + count, value);
+  return converted_p;
+}
+template <typename T> void free_device(device_pointer<T> ptr) {
+  sycl::free(ptr.get(), dpct::get_default_queue());
+}
+#endif
+template <typename T>
+device_pointer<T> device_new(device_pointer<void> p,
+                             const std::size_t count = 1) {
+  return device_new(p, T{}, count);
+}
+template <typename T>
+device_pointer<T> device_new(const std::size_t count = 1) {
+  return device_new(device_pointer<void>(sizeof(T) * count), T{}, count);
+}
+
+template <typename T>
+typename std::enable_if<!std::is_trivially_destructible<T>::value, void>::type
+device_delete(device_pointer<T> p, const std::size_t count = 1) {
+  ::std::destroy(oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
+                 p, p + count);
+  free_device(p);
+}
+template <typename T>
+typename std::enable_if<std::is_trivially_destructible<T>::value, void>::type
+device_delete(device_pointer<T> p, const std::size_t count = 1) {
+  free_device(p);
+}
+
+template <typename T> device_pointer<T> get_device_pointer(T *ptr) {
+  return device_pointer<T>(ptr);
+}
+
+template <typename T>
+device_pointer<T> get_device_pointer(const device_pointer<T> &ptr) {
+  return device_pointer<T>(ptr);
+}
+
+template <typename T> T *get_raw_pointer(const device_pointer<T> &ptr) {
+  return ptr.get();
+}
+
+template <typename Pointer> Pointer get_raw_pointer(const Pointer &ptr) {
+  return ptr;
+}
+
+template <typename T> const T &get_raw_reference(const device_reference<T> &ref) {
+  return ref.value;
+}
+
+template <typename T> T &get_raw_reference(device_reference<T> &ref) {
+  return ref.value;
+}
+
+template <typename T> const T &get_raw_reference(const T &ref) {
+  return ref;
+}
+
+template <typename T> T &get_raw_reference(T &ref) {
+  return ref;
+}
+
+} // namespace dpct
+
+#endif
diff --git a/dpct/dpl_extras/numeric.h b/dpct/dpl_extras/numeric.h
new file mode 100644
index 0000000000000..9864cd17359f3
--- /dev/null
+++ b/dpct/dpl_extras/numeric.h
@@ -0,0 +1,32 @@
+//==---- numeric.h --------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_NUMERIC_H__
+#define __DPCT_NUMERIC_H__
+
+namespace dpct {
+
+template <typename Policy, typename InputIt1, typename InputIt2, typename T>
+T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1,
+                InputIt2 first2, T init) {
+  return std::transform_reduce(std::forward<Policy>(policy), first1, last1,
+                               first2, init);
+}
+
+template <typename Policy, typename InputIt1, typename InputIt2, typename T,
+          typename BinaryOperation1, typename BinaryOperation2>
+T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1,
+                InputIt2 first2, T init, BinaryOperation1 op1,
+                BinaryOperation2 op2) {
+  return std::transform_reduce(std::forward<Policy>(policy), first1, last1,
+                               first2, init, op1, op2);
+}
+
+} // end namespace dpct
+
+#endif
diff --git a/dpct/dpl_extras/vector.h b/dpct/dpl_extras/vector.h
new file mode 100644
index 0000000000000..afba575ae1da9
--- /dev/null
+++ b/dpct/dpl_extras/vector.h
@@ -0,0 +1,752 @@
+//==---- vector.h ---------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_VECTOR_H__
+#define __DPCT_VECTOR_H__
+
+#include <oneapi/dpl/algorithm>
+#include <oneapi/dpl/execution>
+
+#include <sycl/sycl.hpp>
+
+#include "memory.h"
+
+#include <algorithm>
+#include <iterator>
+#include <vector>
+
+#include "../device.hpp"
+
+namespace dpct {
+
+namespace internal {
+template <typename Iter, typename Void = void> // for non-iterators
+struct is_iterator : std::false_type {};
+
+template <typename Iter> // For iterators
+struct is_iterator<
+    Iter,
+    typename std::enable_if<
+        !std::is_void<typename Iter::iterator_category>::value, void>::type>
+    : std::true_type {};
+
+template <typename T> // For pointers
+struct is_iterator<T *> : std::true_type {};
+} // end namespace internal
+
+#ifndef DPCT_USM_LEVEL_NONE
+
+template <typename T,
+          typename Allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>>
+class device_vector {
+public:
+  using iterator = device_iterator<T>;
+  using const_iterator = const iterator;
+  using reference = device_reference<T>;
+  using const_reference = const reference;
+  using value_type = T;
+  using pointer = T *;
+  using const_pointer = const T *;
+  using difference_type =
+      typename ::std::iterator_traits<iterator>::difference_type;
+  using size_type = ::std::size_t;
+
+private:
+  Allocator _alloc;
+  size_type _size;
+  size_type _capacity;
+  pointer _storage;
+
+  size_type _min_capacity() const { return size_type(1); }
+
+  void _set_capacity_and_alloc() {
+    _capacity = ::std::max(_size * 2, _min_capacity());
+    _storage = _alloc.allocate(_capacity);
+  }
+
+public:
+  template <typename OtherA> operator ::std::vector<T, OtherA>() const {
+    auto __tmp = ::std::vector<T, OtherA>(this->size());
+    ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+              this->begin(), this->end(), __tmp.begin());
+    return __tmp;
+  }
+  device_vector()
+      : _alloc(get_default_queue()), _size(0), _capacity(_min_capacity()) {
+    _set_capacity_and_alloc();
+  }
+  ~device_vector() /*= default*/ { _alloc.deallocate(_storage, _capacity); };
+  explicit device_vector(size_type n) : device_vector(n, T()) {}
+  explicit device_vector(size_type n, const T &value)
+      : _alloc(get_default_queue()), _size(n) {
+    _set_capacity_and_alloc();
+    if (_size > 0) {
+      ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                begin(), end(), T(value));
+    }
+  }
+  device_vector(const device_vector &other) : _alloc(get_default_queue()) {
+    _size = other.size();
+    _capacity = other.capacity();
+    _storage = _alloc.allocate(_capacity);
+    ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+              other.begin(), other.end(), begin());
+  }
+  device_vector(device_vector &&other)
+      : _alloc(get_default_queue()), _size(other.size()),
+        _capacity(other.capacity()), _storage(other._storage) {
+    other._size = 0;
+    other._capacity = 0; 
+    other._storage = nullptr;
+  }
+
+  template <typename InputIterator>
+  device_vector(InputIterator first,
+                typename ::std::enable_if<
+                    internal::is_iterator<InputIterator>::value &&
+                        !::std::is_pointer<InputIterator>::value &&
+                        ::std::is_same<typename ::std::iterator_traits<
+                                         InputIterator>::iterator_category,
+                                     ::std::random_access_iterator_tag>::value,
+                    InputIterator>::type last)
+      : _alloc(get_default_queue()) {
+    _size = ::std::distance(first, last);
+    _set_capacity_and_alloc();
+    if (_size > 0) {
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                first, last, begin());
+    }
+  }
+
+  template <typename InputIterator>
+  device_vector(InputIterator first,
+                typename ::std::enable_if<::std::is_pointer<InputIterator>::value,
+                                        InputIterator>::type last)
+      : _alloc(get_default_queue()) {
+    _size = ::std::distance(first, last);
+    _set_capacity_and_alloc();
+    if (_size > 0) {
+      auto ptr_type = sycl::get_pointer_type(first, get_default_context());
+      if (ptr_type != sycl::usm::alloc::host &&
+          ptr_type != sycl::usm::alloc::unknown) {
+        ::std::copy(
+            oneapi::dpl::execution::make_device_policy(get_default_queue()),
+            first, last, begin());
+      } else {
+        sycl::buffer<typename ::std::iterator_traits<InputIterator>::value_type,
+                     1>
+            buf(first, last);
+        auto buf_first = oneapi::dpl::begin(buf);
+        auto buf_last = oneapi::dpl::end(buf);
+        ::std::copy(
+            oneapi::dpl::execution::make_device_policy(get_default_queue()),
+            buf_first, buf_last, begin());
+      }
+    }
+  }
+
+  template <typename InputIterator>
+  device_vector(InputIterator first,
+                typename ::std::enable_if<
+                    internal::is_iterator<InputIterator>::value &&
+                        !::std::is_pointer<InputIterator>::value &&
+                        !::std::is_same<typename ::std::iterator_traits<
+                                          InputIterator>::iterator_category,
+                                      ::std::random_access_iterator_tag>::value,
+                    InputIterator>::type last)
+      : _alloc(get_default_queue()), _size(::std::distance(first, last)) {
+    _set_capacity_and_alloc();
+    ::std::vector<T> _tmp(first, last);
+    if (_size > 0) {
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                _tmp.begin(), _tmp.end(), this->begin());
+    }
+  }
+
+  template <typename OtherAllocator>
+  device_vector(const device_vector<T, OtherAllocator> &v)
+      : _alloc(get_default_queue()), _storage(v.real_begin()), _size(v.size()),
+        _capacity(v.capacity()) {}
+
+  template <typename OtherAllocator>
+  device_vector(::std::vector<T, OtherAllocator> &v)
+      : _alloc(get_default_queue()), _size(v.size()) {
+    _set_capacity_and_alloc();
+    if (_size > 0) {
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                v.begin(), v.end(), this->begin());
+    }
+  }
+
+  template <typename OtherAllocator>
+  device_vector &operator=(const ::std::vector<T, OtherAllocator> &v) {
+    resize(v.size());
+    if (_size > 0) {
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                v.begin(), v.end(), begin());
+    }
+    return *this;
+  }
+  device_vector &operator=(const device_vector &other) {
+    // Copy assignment operator:
+    resize(other.size());
+    if (_size > 0) {
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                other.begin(), other.end(), begin());
+    }
+    return *this;
+  }
+  device_vector &operator=(device_vector &&other) {
+    // Move assignment operator:
+    device_vector dummy(::std::move(other));
+    this->swap(dummy);
+    return *this;
+  }
+  size_type size() const { return _size; }
+  iterator begin() noexcept { return device_iterator<T>(_storage, 0); }
+  iterator end() { return device_iterator<T>(_storage, size()); }
+  const_iterator begin() const noexcept {
+    return device_iterator<T>(_storage, 0);
+  }
+  const_iterator cbegin() const noexcept { return begin(); }
+  const_iterator end() const { return device_iterator<T>(_storage, size()); }
+  const_iterator cend() const { return end(); }
+  T *real_begin() { return _storage; }
+  const T *real_begin() const { return _storage; }
+  void swap(device_vector &v) {
+    ::std::swap(_size, v._size);
+    ::std::swap(_capacity, v._capacity);
+    ::std::swap(_storage, v._storage);
+    ::std::swap(_alloc, v._alloc);
+  }
+  reference operator[](size_type n) { return _storage[n]; }
+  const_reference operator[](size_type n) const { return _storage[n]; }
+  void reserve(size_type n) {
+    if (n > capacity()) {
+      // allocate buffer for new size
+      auto tmp = _alloc.allocate(2 * n);
+      // copy content (old buffer to new buffer)
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                begin(), end(), tmp);
+      // deallocate old memory
+      _alloc.deallocate(_storage, _capacity);
+      _storage = tmp;
+      _capacity = 2 * n;
+    }
+  }
+  void resize(size_type new_size, const T &x = T()) {
+    reserve(new_size);
+    if (_size < new_size) {
+      ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                begin() + _size, begin() + new_size, x);
+    }
+    _size = new_size;
+  }
+  size_type max_size(void) const {
+    return ::std::numeric_limits<size_type>::max() / sizeof(T);
+  }
+  size_type capacity() const { return _capacity; }
+  const_reference front() const { return *begin(); }
+  reference front() { return *begin(); }
+  const_reference back(void) const { return *(end() - 1); }
+  reference back(void) { return *(end() - 1); }
+  pointer data(void) { return _storage; }
+  const_pointer data(void) const { return _storage; }
+  void shrink_to_fit(void) {
+    if (_size != capacity()) {
+      size_type tmp_capacity = ::std::max(_size, _min_capacity());
+      auto tmp = _alloc.allocate(tmp_capacity);
+      if (_size > 0) {
+        ::std::copy(
+            oneapi::dpl::execution::make_device_policy(get_default_queue()),
+            begin(), end(), tmp);
+      }
+      _alloc.deallocate(_storage, _capacity);
+      _storage = tmp;
+      _capacity = tmp_capacity;
+    }
+  }
+  void assign(size_type n, const T &x) {
+    resize(n);
+    if (_size > 0) {
+      ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                begin(), begin() + n, x);
+    }
+  }
+  template <typename InputIterator>
+  void
+  assign(InputIterator first,
+         typename ::std::enable_if<internal::is_iterator<InputIterator>::value,
+                                 InputIterator>::type last) {
+    auto n = ::std::distance(first, last);
+    resize(n);
+    if (_size > 0) {
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                first, last, begin());
+    }
+  }
+  void clear(void) { _size = 0; }
+  bool empty(void) const { return (size() == 0); }
+  void push_back(const T &x) { insert(end(), size_type(1), x); }
+  void pop_back(void) {
+    if (_size > 0)
+      --_size;
+  }
+  iterator erase(iterator first, iterator last) {
+    auto n = ::std::distance(first, last);
+    if (last == end()) {
+      _size = _size - n;
+      return end();
+    }
+    auto m = ::std::distance(last, end());
+    if (m <= 0) {
+      return end();
+    }
+    auto tmp = _alloc.allocate(m);
+    // copy remainder to temporary buffer.
+    ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+              last, end(), tmp);
+    // override (erase) subsequence in storage.
+    ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+              tmp, tmp + m, first);
+    _alloc.deallocate(tmp, m);
+    _size -= n;
+    return begin() + first.get_idx() + n;
+  }
+  iterator erase(iterator pos) { return erase(pos, pos + 1); }
+  iterator insert(iterator position, const T &x) {
+    auto n = ::std::distance(begin(), position);
+    insert(position, size_type(1), x);
+    return begin() + n;
+  }
+  void insert(iterator position, size_type n, const T &x) {
+    if (position == end()) {
+      resize(size() + n);
+      ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                end() - n, end(), x);
+    } else {
+      auto i_n = ::std::distance(begin(), position);
+      // allocate temporary storage
+      auto m = ::std::distance(position, end());
+      // will throw if position is not inside active vector
+      auto tmp = _alloc.allocate(m);
+      // copy remainder
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                position, end(), tmp);
+
+      resize(size() + n);
+      // resizing might invalidate position
+      position = begin() + position.get_idx();
+
+      ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                position, position + n, x);
+
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                tmp, tmp + m, position + n);
+      _alloc.deallocate(tmp, m);
+    }
+  }
+  template <typename InputIterator>
+  void
+  insert(iterator position, InputIterator first,
+         typename ::std::enable_if<internal::is_iterator<InputIterator>::value,
+                                 InputIterator>::type last) {
+    auto n = ::std::distance(first, last);
+    if (position == end()) {
+      resize(size() + n);
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                first, last, end());
+    } else {
+      auto m = ::std::distance(position, end());
+      // will throw if position is not inside active vector
+      auto tmp = _alloc.allocate(m);
+
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                position, end(), tmp);
+
+      resize(size() + n);
+      // resizing might invalidate position
+      position = begin() + position.get_idx();
+
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                first, last, position);
+      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+                tmp, tmp + m, position + n);
+      _alloc.deallocate(tmp, m);
+    }
+  }
+  Allocator get_allocator() const { return _alloc; }
+};
+
+#else
+
+template <typename T, typename Allocator = detail::__buffer_allocator<T>>
+class device_vector {
+  static_assert(
+      std::is_same<Allocator, detail::__buffer_allocator<T>>::value,
+      "device_vector doesn't support custom allocator when USM is not used.");
+
+public:
+  using iterator = device_iterator<T>;
+  using const_iterator = const iterator;
+  using reference = device_reference<T>;
+  using const_reference = const reference;
+  using value_type = T;
+  using pointer = T *;
+  using const_pointer = const T *;
+  using difference_type =
+      typename std::iterator_traits<iterator>::difference_type;
+  using size_type = std::size_t;
+
+private:
+  using Buffer = sycl::buffer<T, 1>;
+  using Range = sycl::range<1>;
+  // Using mem_mgr to handle memory allocation
+  void *_storage;
+  size_type _size;
+
+  size_type _min_capacity() const { return size_type(1); }
+
+  void *alloc_store(size_type num_bytes) {
+    return detail::mem_mgr::instance().mem_alloc(num_bytes);
+  }
+
+public:
+  template <typename OtherA> operator std::vector<T, OtherA>() const {
+    auto __tmp = std::vector<T, OtherA>(this->size());
+    std::copy(oneapi::dpl::execution::dpcpp_default, this->begin(), this->end(),
+              __tmp.begin());
+    return __tmp;
+  }
+  device_vector()
+      : _storage(alloc_store(_min_capacity() * sizeof(T))), _size(0) {}
+  ~device_vector() = default;
+  explicit device_vector(size_type n) : device_vector(n, T()) {}
+  explicit device_vector(size_type n, const T &value)
+      : _storage(alloc_store(std::max(n, _min_capacity()) * sizeof(T))),
+        _size(n) {
+    auto buf = get_buffer();
+    std::fill(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(buf),
+              oneapi::dpl::begin(buf) + n, T(value));
+  }
+  device_vector(const device_vector &other)
+      : _storage(other._storage), _size(other.size()) {}
+  device_vector(device_vector &&other)
+      : _storage(std::move(other._storage)), _size(other.size()) {}
+
+  template <typename InputIterator>
+  device_vector(InputIterator first,
+                typename std::enable_if<
+                    internal::is_iterator<InputIterator>::value &&
+                        !std::is_pointer<InputIterator>::value &&
+                        std::is_same<typename std::iterator_traits<
+                                         InputIterator>::iterator_category,
+                                     std::random_access_iterator_tag>::value,
+                    InputIterator>::type last)
+      : _storage(alloc_store(std::distance(first, last) * sizeof(T))),
+        _size(std::distance(first, last)) {
+    auto buf = get_buffer();
+    auto dst = oneapi::dpl::begin(buf);
+    std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+              first, last, dst);
+  }
+
+  template <typename InputIterator>
+  device_vector(InputIterator first,
+                typename std::enable_if<std::is_pointer<InputIterator>::value,
+                                        InputIterator>::type last)
+      : _storage(alloc_store(std::distance(first, last) * sizeof(T))),
+        _size(std::distance(first, last)) {
+    auto buf = get_buffer();
+    Buffer tmp_buf(first, last);
+    auto start = oneapi::dpl::begin(tmp_buf);
+    auto end = oneapi::dpl::end(tmp_buf);
+    auto dst = oneapi::dpl::begin(buf);
+    std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+              start, end, dst);
+  }
+
+  template <typename InputIterator>
+  device_vector(InputIterator first,
+                typename std::enable_if<
+                    internal::is_iterator<InputIterator>::value &&
+                        !std::is_same<typename std::iterator_traits<
+                                          InputIterator>::iterator_category,
+                                      std::random_access_iterator_tag>::value,
+                    InputIterator>::type last)
+      : _storage(alloc_store(std::distance(first, last) * sizeof(T))),
+        _size(std::distance(first, last)) {
+    auto buf = get_buffer();
+    std::vector<T> tmp(first, last);
+    Buffer tmp_buf(tmp);
+    auto start = oneapi::dpl::begin(tmp_buf);
+    auto end = oneapi::dpl::end(tmp_buf);
+    auto dst = oneapi::dpl::begin(buf);
+    std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+              start, end, dst);
+  }
+
+  template <typename OtherAllocator>
+  device_vector(const device_vector<T, OtherAllocator> &v)
+      : _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) {
+    auto buf = get_buffer();
+    auto dst = oneapi::dpl::begin(buf);
+    std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
+              v.real_begin(), v.real_begin() + v.size(), dst);
+  }
+
+  template <typename OtherAllocator>
+  device_vector(std::vector<T, OtherAllocator> &v)
+      : _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) {
+    std::copy(oneapi::dpl::execution::dpcpp_default, v.begin(), v.end(),
+              oneapi::dpl::begin(get_buffer()));
+  }
+
+  device_vector &operator=(const device_vector &other) {
+    // Copy assignment operator:
+    _size = other.size();
+    void *tmp = alloc_store(_size * sizeof(T));
+    auto tmp_buf =
+        detail::mem_mgr::instance()
+            .translate_ptr(tmp)
+            .buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
+    std::copy(oneapi::dpl::execution::dpcpp_default,
+              oneapi::dpl::begin(other.get_buffer()),
+              oneapi::dpl::end(other.get_buffer()),
+              oneapi::dpl::begin(tmp_buf));
+    detail::mem_mgr::instance().mem_free(_storage);
+    _storage = tmp;
+    return *this;
+  }
+  device_vector &operator=(device_vector &&other) {
+    // Move assignment operator:
+    _size = other.size();
+    this->_storage = std::move(other._storage);
+    return *this;
+  }
+  template <typename OtherAllocator>
+  device_vector &operator=(const std::vector<T, OtherAllocator> &v) {
+    Buffer data(v.begin(), v.end());
+    _size = v.size();
+    void *tmp = alloc_store(_size * sizeof(T));
+    auto tmp_buf =
+        detail::mem_mgr::instance()
+            .translate_ptr(tmp)
+            .buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
+    std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(data),
+              oneapi::dpl::end(data), oneapi::dpl::begin(tmp_buf));
+    detail::mem_mgr::instance().mem_free(_storage);
+    _storage = tmp;
+
+    return *this;
+  }
+  Buffer get_buffer() const {
+    return detail::mem_mgr::instance()
+        .translate_ptr(_storage)
+        .buffer.template reinterpret<T, 1>(sycl::range<1>(capacity()));
+  }
+  size_type size() const { return _size; }
+  iterator begin() noexcept { return device_iterator<T>(get_buffer(), 0); }
+  iterator end() { return device_iterator<T>(get_buffer(), _size); }
+  const_iterator begin() const noexcept {
+    return device_iterator<T>(get_buffer(), 0);
+  }
+  const_iterator cbegin() const noexcept { return begin(); }
+  const_iterator end() const { return device_iterator<T>(get_buffer(), _size); }
+  const_iterator cend() const { return end(); }
+  T *real_begin() {
+    return (detail::mem_mgr::instance()
+                .translate_ptr(_storage)
+                .buffer.template get_access<sycl::access_mode::read_write>())
+        .get_pointer();
+  }
+  const T *real_begin() const {
+    return const_cast<device_vector *>(this)
+        ->detail::mem_mgr::instance()
+        .translate_ptr(_storage)
+        .buffer.template get_access<sycl::access_mode::read_write>()
+        .get_pointer();
+  }
+  void swap(device_vector &v) {
+    void *temp = v._storage;
+    v._storage = this->_storage;
+    this->_storage = temp;
+    std::swap(_size, v._size);
+  }
+  reference operator[](size_type n) { return *(begin() + n); }
+  const_reference operator[](size_type n) const { return *(begin() + n); }
+  void reserve(size_type n) {
+    if (n > capacity()) {
+      // create new buffer (allocate for new size)
+      void *a = alloc_store(n * sizeof(T));
+
+      // copy content (old buffer to new buffer)
+      if (_storage != nullptr) {
+        auto tmp = detail::mem_mgr::instance()
+                       .translate_ptr(a)
+                       .buffer.template reinterpret<T, 1>(sycl::range<1>(n));
+        auto src_buf = get_buffer();
+        std::copy(oneapi::dpl::execution::dpcpp_default,
+                  oneapi::dpl::begin(src_buf), oneapi::dpl::end(src_buf),
+                  oneapi::dpl::begin(tmp));
+
+        // deallocate old memory
+        detail::mem_mgr::instance().mem_free(_storage);
+      }
+      _storage = a;
+    }
+  }
+  void resize(size_type new_size, const T &x = T()) {
+    reserve(new_size);
+    if (_size < new_size) {
+      auto src_buf = get_buffer();
+      std::fill(oneapi::dpl::execution::dpcpp_default,
+                oneapi::dpl::begin(src_buf) + _size,
+                oneapi::dpl::begin(src_buf) + new_size, x);
+    }
+    _size = new_size;
+  }
+  size_type max_size(void) const {
+    return std::numeric_limits<size_type>::max() / sizeof(T);
+  }
+  size_type capacity() const {
+    return _storage != nullptr ? detail::mem_mgr::instance()
+                                         .translate_ptr(_storage)
+                                         .buffer.size() /
+                                     sizeof(T)
+                               : 0;
+  }
+  const_reference front() const { return *begin(); }
+  reference front() { return *begin(); }
+  const_reference back(void) const { return *(end() - 1); }
+  reference back(void) { return *(end() - 1); }
+  pointer data(void) { return reinterpret_cast<pointer>(_storage); }
+  const_pointer data(void) const {
+    return reinterpret_cast<const_pointer>(_storage);
+  }
+  void shrink_to_fit(void) {
+    if (_size != capacity()) {
+      void *a = alloc_store(_size * sizeof(T));
+      auto tmp = detail::mem_mgr::instance()
+                     .translate_ptr(a)
+                     .buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
+      std::copy(oneapi::dpl::execution::dpcpp_default,
+                oneapi::dpl::begin(get_buffer()),
+                oneapi::dpl::begin(get_buffer()) + _size,
+                oneapi::dpl::begin(tmp));
+      detail::mem_mgr::instance().mem_free(_storage);
+      _storage = a;
+    }
+  }
+  void assign(size_type n, const T &x) {
+    resize(n);
+    std::fill(oneapi::dpl::execution::dpcpp_default, begin(), begin() + n, x);
+  }
+  template <typename InputIterator>
+  void
+  assign(InputIterator first,
+         typename std::enable_if<internal::is_iterator<InputIterator>::value,
+                                 InputIterator>::type last) {
+    auto n = std::distance(first, last);
+    resize(n);
+    if (internal::is_iterator<InputIterator>::value &&
+        !std::is_pointer<InputIterator>::value)
+      std::copy(oneapi::dpl::execution::dpcpp_default, first, last, begin());
+    else {
+      Buffer tmp(first, last);
+      std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
+                oneapi::dpl::end(tmp), begin());
+    }
+  }
+  void clear(void) {
+    _size = 0;
+    detail::mem_mgr::instance().mem_free(_storage);
+    _storage = nullptr;
+  }
+  bool empty(void) const { return (size() == 0); }
+  void push_back(const T &x) { insert(end(), size_type(1), x); }
+  void pop_back(void) {
+    if (_size > 0)
+      --_size;
+  }
+  iterator erase(iterator first, iterator last) {
+    auto n = std::distance(first, last);
+    if (last == end()) {
+      _size = _size - n;
+      return end();
+    }
+    Buffer tmp{Range(std::distance(last, end()))};
+    // copy remainder to temporary buffer.
+    std::copy(oneapi::dpl::execution::dpcpp_default, last, end(),
+              oneapi::dpl::begin(tmp));
+    // override (erase) subsequence in storage.
+    std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
+              oneapi::dpl::end(tmp), first);
+    resize(_size - n);
+    return begin() + first.get_idx() + n;
+  }
+  iterator erase(iterator pos) { return erase(pos, pos + 1); }
+  iterator insert(iterator position, const T &x) {
+    auto n = std::distance(begin(), position);
+    insert(position, size_type(1), x);
+    return begin() + n;
+  }
+  void insert(iterator position, size_type n, const T &x) {
+    if (position == end()) {
+      resize(size() + n);
+      std::fill(oneapi::dpl::execution::dpcpp_default, end() - n, end(), x);
+    } else {
+      auto i_n = std::distance(begin(), position);
+      // allocate temporary storage
+      Buffer tmp{Range(std::distance(position, end()))};
+      // copy remainder
+      std::copy(oneapi::dpl::execution::dpcpp_default, position, end(),
+                oneapi::dpl::begin(tmp));
+
+      resize(size() + n);
+      // resizing might invalidate position
+      position = begin() + position.get_idx();
+
+      std::fill(oneapi::dpl::execution::dpcpp_default, position, position + n,
+                x);
+
+      std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
+                oneapi::dpl::end(tmp), position + n);
+    }
+  }
+  template <typename InputIterator>
+  void
+  insert(iterator position, InputIterator first,
+         typename std::enable_if<internal::is_iterator<InputIterator>::value,
+                                 InputIterator>::type last) {
+    auto n = std::distance(first, last);
+    if (position == end()) {
+      resize(size() + n);
+      std::copy(oneapi::dpl::execution::dpcpp_default, first, last, end());
+    } else {
+      Buffer tmp{Range(std::distance(position, end()))};
+
+      std::copy(oneapi::dpl::execution::dpcpp_default, position, end(),
+                oneapi::dpl::begin(tmp));
+
+      resize(size() + n);
+      // resizing might invalidate position
+      position = begin() + position.get_idx();
+
+      std::copy(oneapi::dpl::execution::dpcpp_default, first, last, position);
+      std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
+                oneapi::dpl::end(tmp), position + n);
+    }
+  }
+};
+
+#endif
+
+} // end namespace dpct
+
+#endif
diff --git a/dpct/dpl_utils.hpp b/dpct/dpl_utils.hpp
new file mode 100644
index 0000000000000..79a6e74048f33
--- /dev/null
+++ b/dpct/dpl_utils.hpp
@@ -0,0 +1,26 @@
+//==---- dpl_utils.hpp ----------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_DPL_UTILS_HPP__
+#define __DPCT_DPL_UTILS_HPP__
+
+#define ONEDPL_USE_DPCPP_BACKEND 1
+#define __USE_DPCT 1
+
+#include <oneapi/dpl/execution>
+#include <oneapi/dpl/algorithm>
+#include <oneapi/dpl/numeric>
+
+#include "dpl_extras/memory.h"
+#include "dpl_extras/algorithm.h"
+#include "dpl_extras/numeric.h"
+#include "dpl_extras/iterators.h"
+#include "dpl_extras/vector.h"
+#include "dpl_extras/dpcpp_extensions.h"
+
+#endif // __DPCT_DPL_UTILS_HPP__
diff --git a/dpct/fft_utils.hpp b/dpct/fft_utils.hpp
new file mode 100644
index 0000000000000..cba1b253cecaf
--- /dev/null
+++ b/dpct/fft_utils.hpp
@@ -0,0 +1,1376 @@
+//==---- fft_utils.hpp ----------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_FFT_UTILS_HPP__
+#define __DPCT_FFT_UTILS_HPP__
+
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include <optional>
+#include <utility>
+#include "lib_common_utils.hpp"
+
+namespace dpct {
+namespace fft {
+/// An enumeration type to describe the FFT direction is forward or backward.
+enum fft_direction : int {
+  forward = 0,
+  backward
+};
+/// An enumeration type to describe the types of FFT input and output data.
+enum fft_type : int {
+  real_float_to_complex_float = 0,
+  complex_float_to_real_float,
+  real_double_to_complex_double,
+  complex_double_to_real_double,
+  complex_float_to_complex_float,
+  complex_double_to_complex_double,
+};
+
+/// A class to perform FFT calculation.
+class fft_engine {
+public:
+  /// Default constructor.
+  fft_engine() {}
+  /// Commit the configuration to calculate n-D FFT.
+  /// \param [in] exec_queue The queue where the calculation should be executed.
+  /// \param [in] dim Dimension number of the data.
+  /// \param [in] n Pointer to an array containing each dimension's size.
+  /// \param [in] inembed Pointer to an array containing each dimension's size
+  /// of the embedded input data.
+  /// \param [in] istride Stride size of the input data.
+  /// \param [in] idist Distance between the two batches of the input data.
+  /// \param [in] input_type Input data type.
+  /// \param [in] onembed Pointer to an array containing each dimension's size
+  /// of the embedded output data.
+  /// \param [in] ostride Stride size of the output data.
+  /// \param [in] odist Distance between the two batches of the output data.
+  /// \param [in] output_type Output data type.
+  /// \param [in] batch The number of FFT operations to perform.
+  /// \param [out] scratchpad_size The workspace size required for this FFT.
+  /// If this value is used to allocate memory, \p direction_and_placement need
+  /// to be specified explicitly to get correct result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If this value is specified, the direction parameter
+  /// will be ignored in the fft_engine::compute function. If it is not set,
+  /// forward direction(if current FFT is complex-to-complex) and out-of-place
+  /// (false) are set by default.
+  void commit(sycl::queue *exec_queue, int dim, long long *n,
+              long long *inembed, long long istride, long long idist,
+              library_data_t input_type, long long *onembed, long long ostride,
+              long long odist, library_data_t output_type, long long batch,
+              size_t *scratchpad_size,
+              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                  direction_and_placement = std::nullopt) {
+    _q = exec_queue;
+    init<long long>(dim, n, inembed, istride, idist, input_type, onembed,
+                    ostride, odist, output_type, batch,
+                    direction_and_placement);
+    if (scratchpad_size) {
+      if (_is_estimate_call)
+        *scratchpad_size = _workspace_estimate_bytes;
+      else
+        *scratchpad_size = _workspace_bytes;
+    }
+  }
+  /// Commit the configuration to calculate n-D FFT.
+  /// \param [in] exec_queue The queue where the calculation should be executed.
+  /// \param [in] dim Dimension number of the data.
+  /// \param [in] n Pointer to an array containing each dimension's size.
+  /// \param [in] inembed Pointer to an array containing each dimension's size
+  /// of the embedded input data.
+  /// \param [in] istride Stride size of the input data.
+  /// \param [in] idist Distance between the two batches of the input data.
+  /// \param [in] input_type Input data type.
+  /// \param [in] onembed Pointer to an array containing each dimension's size
+  /// of the embedded output data.
+  /// \param [in] ostride Stride size of the output data.
+  /// \param [in] odist Distance between the two batches of the output data.
+  /// \param [in] output_type Output data type.
+  /// \param [in] batch The number of FFT operations to perform.
+  /// \param [out] scratchpad_size The workspace size required for this FFT.
+  /// If this value is used to allocate memory, \p direction_and_placement need
+  /// to be specified explicitly to get correct result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If this value is specified, the direction parameter
+  /// will be ignored in the fft_engine::compute function. If it is not set,
+  /// forward direction(if current FFT is complex-to-complex) and out-of-place
+  /// (false) are set by default.
+  void commit(sycl::queue *exec_queue, int dim, int *n, int *inembed,
+              int istride, int idist, library_data_t input_type, int *onembed,
+              int ostride, int odist, library_data_t output_type, int batch,
+              size_t *scratchpad_size,
+              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                  direction_and_placement = std::nullopt) {
+    _q = exec_queue;
+    init<int>(dim, n, inembed, istride, idist, input_type, onembed, ostride,
+              odist, output_type, batch, direction_and_placement);
+    if (scratchpad_size) {
+      if (_is_estimate_call)
+        *scratchpad_size = _workspace_estimate_bytes;
+      else
+        *scratchpad_size = _workspace_bytes;
+    }
+  }
+  /// Commit the configuration to calculate n-D FFT.
+  /// \param [in] exec_queue The queue where the calculation should be executed.
+  /// \param [in] dim Dimension number of the data.
+  /// \param [in] n Pointer to an array containing each dimension's size.
+  /// \param [in] inembed Pointer to an array containing each dimension's size
+  /// of the embedded input data.
+  /// \param [in] istride Stride size of the input data.
+  /// \param [in] idist Distance between the two batches of the input data.
+  /// \param [in] onembed Pointer to an array containing each dimension's size
+  /// of the embedded output data.
+  /// \param [in] ostride Stride size of the output data.
+  /// \param [in] odist Distance between the two batches of the output data.
+  /// \param [in] type The FFT type.
+  /// \param [in] batch The number of FFT operations to perform.
+  /// \param [out] scratchpad_size The workspace size required for this FFT.
+  /// If this value is used to allocate memory, \p direction_and_placement need
+  /// to be specified explicitly to get correct result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If this value is specified, the direction parameter
+  /// will be ignored in the fft_engine::compute function. If it is not set,
+  /// forward direction(if current FFT is complex-to-complex) and out-of-place
+  /// (false) are set by default.
+  void commit(sycl::queue *exec_queue, int dim, long long *n,
+              long long *inembed, long long istride, long long idist,
+              long long *onembed, long long ostride, long long odist,
+              fft_type type, long long batch, size_t *scratchpad_size,
+              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                  direction_and_placement = std::nullopt) {
+    commit(exec_queue, dim, n, inembed, istride, idist,
+           fft_type_to_data_type(type).first, onembed, ostride, odist,
+           fft_type_to_data_type(type).second, batch, scratchpad_size,
+           direction_and_placement);
+  }
+  /// Commit the configuration to calculate n-D FFT.
+  /// \param [in] exec_queue The queue where the calculation should be executed.
+  /// \param [in] dim Dimension number of the data.
+  /// \param [in] n Pointer to an array containing each dimension's size.
+  /// \param [in] inembed Pointer to an array containing each dimension's size
+  /// of the embedded input data.
+  /// \param [in] istride Stride size of the input data.
+  /// \param [in] idist Distance between the two batches of the input data.
+  /// \param [in] onembed Pointer to an array containing each dimension's size
+  /// of the embedded output data.
+  /// \param [in] ostride Stride size of the output data.
+  /// \param [in] odist Distance between the two batches of the output data.
+  /// \param [in] type The FFT type.
+  /// \param [in] batch The number of FFT operations to perform.
+  /// \param [out] scratchpad_size The workspace size required for this FFT.
+  /// If this value is used to allocate memory, \p direction_and_placement need
+  /// to be specified explicitly to get correct result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If this value is specified, the direction parameter
+  /// will be ignored in the fft_engine::compute function. If it is not set,
+  /// forward direction(if current FFT is complex-to-complex) and out-of-place
+  /// (false) are set by default.
+  void commit(sycl::queue *exec_queue, int dim, int *n, int *inembed,
+              int istride, int idist, int *onembed, int ostride, int odist,
+              fft_type type, int batch, size_t *scratchpad_size,
+              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                  direction_and_placement = std::nullopt) {
+    commit(exec_queue, dim, n, inembed, istride, idist,
+           fft_type_to_data_type(type).first, onembed, ostride, odist,
+           fft_type_to_data_type(type).second, batch, scratchpad_size,
+           direction_and_placement);
+  }
+  /// Commit the configuration to calculate 1-D FFT.
+  /// \param [in] exec_queue The queue where the calculation should be executed.
+  /// \param [in] n1 The size of the dimension of the data.
+  /// \param [in] type The FFT type.
+  /// \param [in] batch The number of FFT operations to perform.
+  /// \param [out] scratchpad_size The workspace size required for this FFT.
+  /// If this value is used to allocate memory, \p direction_and_placement need
+  /// to be specified explicitly to get correct result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If this value is specified, the direction parameter
+  /// will be ignored in the fft_engine::compute function. If it is not set,
+  /// forward direction(if current FFT is complex-to-complex) and out-of-place
+  /// (false) are set by default.
+  void commit(sycl::queue *exec_queue, int n1, fft_type type, int batch,
+              size_t *scratchpad_size,
+              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                  direction_and_placement = std::nullopt) {
+    _q = exec_queue;
+    _n.resize(1);
+    _n[0] = n1;
+    std::tie(_input_type, _output_type) = fft_type_to_data_type(type);
+    _dim = 1;
+    _batch = batch;
+    _is_basic = true;
+    if (direction_and_placement.has_value()) {
+      _is_user_specified_dir_and_placement = true;
+      _direction = direction_and_placement->first;
+      _is_inplace = direction_and_placement->second;
+    }
+    config_and_commit_basic();
+    if (scratchpad_size) {
+      if (_is_estimate_call)
+        *scratchpad_size = _workspace_estimate_bytes;
+      else
+        *scratchpad_size = _workspace_bytes;
+    }
+  }
+  /// Commit the configuration to calculate 2-D FFT.
+  /// \param [in] exec_queue The queue where the calculation should be executed.
+  /// \param [in] n2 The size of the 2nd dimension (outermost) of the data.
+  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
+  /// \param [in] type The FFT type.
+  /// \param [out] scratchpad_size The workspace size required for this FFT.
+  /// If this value is used to allocate memory, \p direction_and_placement need
+  /// to be specified explicitly to get correct result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If this value is specified, the direction parameter
+  /// will be ignored in the fft_engine::compute function. If it is not set,
+  /// forward direction(if current FFT is complex-to-complex) and out-of-place
+  /// (false) are set by default.
+  void commit(sycl::queue *exec_queue, int n2, int n1, fft_type type,
+              size_t *scratchpad_size,
+              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                  direction_and_placement = std::nullopt) {
+    _q = exec_queue;
+    _n.resize(2);
+    _n[0] = n2;
+    _n[1] = n1;
+    std::tie(_input_type, _output_type) = fft_type_to_data_type(type);
+    _dim = 2;
+    _is_basic = true;
+    if (direction_and_placement.has_value()) {
+      _is_user_specified_dir_and_placement = true;
+      _direction = direction_and_placement->first;
+      _is_inplace = direction_and_placement->second;
+    }
+    config_and_commit_basic();
+    if (scratchpad_size) {
+      if (_is_estimate_call)
+        *scratchpad_size = _workspace_estimate_bytes;
+      else
+        *scratchpad_size = _workspace_bytes;
+    }
+  }
+  /// Commit the configuration to calculate 3-D FFT.
+  /// \param [in] exec_queue The queue where the calculation should be executed.
+  /// \param [in] n3 The size of the 3rd dimension (outermost) of the data.
+  /// \param [in] n2 The size of the 2nd dimension of the data.
+  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
+  /// \param [in] type The FFT type.
+  /// \param [out] scratchpad_size The workspace size required for this FFT.
+  /// If this value is used to allocate memory, \p direction_and_placement need
+  /// to be specified explicitly to get correct result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If this value is specified, the direction parameter
+  /// will be ignored in the fft_engine::compute function. If it is not set,
+  /// forward direction(if current FFT is complex-to-complex) and out-of-place
+  /// (false) are set by default.
+  void commit(sycl::queue *exec_queue, int n3, int n2, int n1, fft_type type,
+              size_t *scratchpad_size,
+              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                  direction_and_placement = std::nullopt) {
+    _q = exec_queue;
+    _n.resize(3);
+    _n[0] = n3;
+    _n[1] = n2;
+    _n[2] = n1;
+    std::tie(_input_type, _output_type) = fft_type_to_data_type(type);
+    _dim = 3;
+    _is_basic = true;
+    if (direction_and_placement.has_value()) {
+      _is_user_specified_dir_and_placement = true;
+      _direction = direction_and_placement->first;
+      _is_inplace = direction_and_placement->second;
+    }
+    config_and_commit_basic();
+    if (scratchpad_size) {
+      if (_is_estimate_call)
+        *scratchpad_size = _workspace_estimate_bytes;
+      else
+        *scratchpad_size = _workspace_bytes;
+    }
+  }
+
+  /// Create the class for calculate 1-D FFT.
+  /// \param [in] exec_queue The queue where the calculation should be executed.
+  /// \param [in] n1 The size of the dimension of the data.
+  /// \param [in] type The FFT type.
+  /// \param [in] batch The number of FFT operations to perform.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If this value is specified, the direction parameter
+  /// will be ignored in the fft_engine::compute function. If it is not set,
+  /// forward direction(if current FFT is complex-to-complex) and out-of-place
+  /// (false) are set by default.
+  static fft_engine *
+  create(sycl::queue *exec_queue, int n1, fft_type type, int batch,
+         std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+             direction_and_placement = std::nullopt) {
+    fft_engine *engine = new fft_engine();
+    engine->commit(exec_queue, n1, type, batch, nullptr,
+                   direction_and_placement);
+    return engine;
+  }
+  /// Create the class for calculate 2-D FFT.
+  /// \param [in] exec_queue The queue where the calculation should be executed.
+  /// \param [in] n2 The size of the 2nd dimension (outermost) of the data.
+  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
+  /// \param [in] type The FFT type.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If this value is specified, the direction parameter
+  /// will be ignored in the fft_engine::compute function. If it is not set,
+  /// forward direction(if current FFT is complex-to-complex) and out-of-place
+  /// (false) are set by default.
+  static fft_engine *
+  create(sycl::queue *exec_queue, int n2, int n1, fft_type type,
+         std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+             direction_and_placement = std::nullopt) {
+    fft_engine *engine = new fft_engine();
+    engine->commit(exec_queue, n2, n1, type, nullptr, direction_and_placement);
+    return engine;
+  }
+  /// Create the class for calculate 3-D FFT.
+  /// \param [in] exec_queue The queue where the calculation should be executed.
+  /// \param [in] n3 The size of the 3rd dimension (outermost) of the data.
+  /// \param [in] n2 The size of the 2nd dimension of the data.
+  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
+  /// \param [in] type The FFT type.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If this value is specified, the direction parameter
+  /// will be ignored in the fft_engine::compute function. If it is not set,
+  /// forward direction(if current FFT is complex-to-complex) and out-of-place
+  /// (false) are set by default.
+  static fft_engine *
+  create(sycl::queue *exec_queue, int n3, int n2, int n1, fft_type type,
+         std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+             direction_and_placement = std::nullopt) {
+    fft_engine *engine = new fft_engine();
+    engine->commit(exec_queue, n3, n2, n1, type, nullptr,
+                   direction_and_placement);
+    return engine;
+  }
+  /// Create the class for calculate n-D FFT.
+  /// \param [in] exec_queue The queue where the calculation should be executed.
+  /// \param [in] dim Dimension number of the data.
+  /// \param [in] n Pointer to an array containing each dimension's size.
+  /// \param [in] inembed Pointer to an array containing each dimension's size
+  /// of the embedded input data.
+  /// \param [in] istride Stride size of the input data.
+  /// \param [in] idist Distance between the two batches of the input data.
+  /// \param [in] onembed Pointer to an array containing each dimension's size
+  /// of the embedded output data.
+  /// \param [in] ostride Stride size of the output data.
+  /// \param [in] odist Distance between the two batches of the output data.
+  /// \param [in] type The FFT type.
+  /// \param [in] batch The number of FFT operations to perform.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If this value is specified, the direction parameter
+  /// will be ignored in the fft_engine::compute function. If it is not set,
+  /// forward direction(if current FFT is complex-to-complex) and out-of-place
+  /// (false) are set by default.
+  static fft_engine *
+  create(sycl::queue *exec_queue, int dim, int *n, int *inembed, int istride,
+         int idist, int *onembed, int ostride, int odist, fft_type type,
+         int batch,
+         std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+             direction_and_placement = std::nullopt) {
+    fft_engine *engine = new fft_engine();
+    engine->commit(exec_queue, dim, n, inembed, istride, idist, onembed,
+                   ostride, odist, type, batch, nullptr,
+                   direction_and_placement);
+    return engine;
+  }
+  /// Create the class for calculate FFT without commit any config.
+  static fft_engine *create() {
+    fft_engine *engine = new fft_engine();
+    return engine;
+  }
+  /// Destroy the class for calculate FFT.
+  /// \param [in] engine Pointer returned from fft_engine::craete.
+  static void destroy(fft_engine *engine) { delete engine; }
+
+#ifdef __INTEL_MKL__
+  /// Estimates the workspace size for calculating n-D FFT.
+  /// \param [in] dim Dimension number of the data.
+  /// \param [in] n Pointer to an array containing each dimension's size.
+  /// \param [in] inembed Pointer to an array containing each dimension's size
+  /// of the embedded input data.
+  /// \param [in] istride Stride size of the input data.
+  /// \param [in] idist Distance between the two batches of the input data.
+  /// \param [in] onembed Pointer to an array containing each dimension's size
+  /// of the embedded output data.
+  /// \param [in] ostride Stride size of the output data.
+  /// \param [in] odist Distance between the two batches of the output data.
+  /// \param [in] type The FFT type.
+  /// \param [in] batch The number of FFT operations to perform.
+  /// \param [out] estimated_scratchpad_size The estimated workspace size
+  /// required for this FFT. If this value is used to allocate memory,
+  /// \p direction_and_placement need to be specified explicitly to get correct
+  /// result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT
+  /// direction and placement info. If it is not set, forward direction(if
+  /// current FFT is complex-to-complex) and out-of-place (false) are set by default.
+  static void
+  estimate_size(int dim, long long *n, long long *inembed, long long istride,
+                long long idist, long long *onembed, long long ostride,
+                long long odist, fft_type type, long long batch,
+                size_t *estimated_scratchpad_size,
+                std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                    direction_and_placement = std::nullopt) {
+    fft_engine *engine = fft_engine::create();
+    engine->_is_estimate_call = true;
+    engine->commit(&dpct::get_default_queue(), dim, n, inembed, istride, idist,
+                   fft_type_to_data_type(type).first, onembed, ostride, odist,
+                   fft_type_to_data_type(type).second, batch,
+                   estimated_scratchpad_size, direction_and_placement);
+    fft_engine::destroy(engine);
+  }
+  /// Estimates the workspace size for calculating n-D FFT.
+  /// \param [in] dim Dimension number of the data.
+  /// \param [in] n Pointer to an array containing each dimension's size.
+  /// \param [in] inembed Pointer to an array containing each dimension's size
+  /// of the embedded input data.
+  /// \param [in] istride Stride size of the input data.
+  /// \param [in] idist Distance between the two batches of the input data.
+  /// \param [in] onembed Pointer to an array containing each dimension's size
+  /// of the embedded output data.
+  /// \param [in] ostride Stride size of the output data.
+  /// \param [in] odist Distance between the two batches of the output data.
+  /// \param [in] type The FFT type.
+  /// \param [in] batch The number of FFT operations to perform.
+  /// \param [out] estimated_scratchpad_size The estimated workspace size
+  /// required for this FFT. If this value is used to allocate memory,
+  /// \p direction_and_placement need to be specified explicitly to get correct
+  /// result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT
+  /// direction and placement info. If it is not set, forward direction(if
+  /// current FFT is complex-to-complex) and out-of-place (false) are set by default.
+  static void
+  estimate_size(int dim, int *n, int *inembed, int istride, int idist,
+                int *onembed, int ostride, int odist, fft_type type, int batch,
+                size_t *estimated_scratchpad_size,
+                std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                    direction_and_placement = std::nullopt) {
+    fft_engine *engine = fft_engine::create();
+    engine->_is_estimate_call = true;
+    engine->commit(&dpct::get_default_queue(), dim, n, inembed, istride, idist,
+                   fft_type_to_data_type(type).first, onembed, ostride, odist,
+                   fft_type_to_data_type(type).second, batch,
+                   estimated_scratchpad_size, direction_and_placement);
+    fft_engine::destroy(engine);
+  }
+  /// Estimates the workspace size for calculating 1-D FFT.
+  /// \param [in] n1 The size of the dimension of the data.
+  /// \param [in] type The FFT type.
+  /// \param [in] batch The number of FFT operations to perform.
+  /// \param [out] estimated_scratchpad_size The estimated workspace size
+  /// required for this FFT. If this value is used to allocate memory,
+  /// \p direction_and_placement need to be specified explicitly to get correct
+  /// result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT direction
+  /// and placement info. If it is not set, forward direction(if current FFT is
+  /// complex-to-complex) and out-of-place (false) are set by default.
+  static void
+  estimate_size(int n1, fft_type type, int batch,
+                size_t *estimated_scratchpad_size,
+                std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                    direction_and_placement = std::nullopt) {
+    fft_engine *engine = fft_engine::create();
+    engine->_is_estimate_call = true;
+    engine->commit(&dpct::get_default_queue(), n1, type, batch,
+                   estimated_scratchpad_size, direction_and_placement);
+    fft_engine::destroy(engine);
+  }
+  /// Estimates the workspace size for calculating 2-D FFT.
+  /// \param [in] n2 The size of the 2nd dimension (outermost) of the data.
+  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
+  /// \param [in] type The FFT type.
+  /// \param [out] estimated_scratchpad_size The estimated workspace size
+  /// required for this FFT. If this value is used to allocate memory,
+  /// \p direction_and_placement need to be specified explicitly to get correct
+  /// result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT
+  /// direction and placement info. If it is not set, forward direction(if
+  /// current FFT is complex-to-complex) and out-of-place (false) are set by default.
+  static void
+  estimate_size(int n2, int n1, fft_type type,
+                size_t *estimated_scratchpad_size,
+                std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                    direction_and_placement = std::nullopt) {
+    fft_engine *engine = fft_engine::create();
+    engine->_is_estimate_call = true;
+    engine->commit(&dpct::get_default_queue(), n2, n1, type,
+                   estimated_scratchpad_size, direction_and_placement);
+    fft_engine::destroy(engine);
+  }
+  /// Estimates the workspace size for calculating 3-D FFT.
+  /// \param [in] n3 The size of the 3rd dimension (outermost) of the data.
+  /// \param [in] n2 The size of the 2nd dimension of the data.
+  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
+  /// \param [in] type The FFT type.
+  /// \param [out] estimated_scratchpad_size The estimated workspace size
+  /// required for this FFT. If this value is used to allocate memory,
+  /// \p direction_and_placement need to be specified explicitly to get correct
+  /// result.
+  /// \param [in] direction_and_placement Explicitly specify the FFT
+  /// direction and placement info. If it is not set, forward direction(if
+  /// current FFT is complex-to-complex) and out-of-place (false) are set by default.
+  static void
+  estimate_size(int n3, int n2, int n1, fft_type type,
+                size_t *estimated_scratchpad_size,
+                std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                    direction_and_placement = std::nullopt) {
+    fft_engine *engine = fft_engine::create();
+    engine->_is_estimate_call = true;
+    engine->commit(&dpct::get_default_queue(), n3, n2, n1, type,
+                   estimated_scratchpad_size, direction_and_placement);
+    fft_engine::destroy(engine);
+  }
+#endif
+
+  /// Execute the FFT calculation.
+  /// \param [in] input Pointer to the input data.
+  /// \param [out] output Pointer to the output data.
+  /// \param [in] direction The FFT direction.
+  template <typename input_t, typename output_t>
+  void compute(input_t *input, output_t *output, fft_direction direction) {
+    if (_input_type == library_data_t::complex_float &&
+        _output_type == library_data_t::complex_float) {
+      compute_complex<float, oneapi::mkl::dft::precision::SINGLE>(
+          (float *)input, (float *)output, direction);
+    } else if (_input_type == library_data_t::complex_double &&
+               _output_type == library_data_t::complex_double) {
+      compute_complex<double, oneapi::mkl::dft::precision::DOUBLE>(
+          (double *)input, (double *)output, direction);
+    } else if (_input_type == library_data_t::real_float &&
+               _output_type == library_data_t::complex_float) {
+      _direction = direction;
+      compute_real<float, oneapi::mkl::dft::precision::SINGLE>((float *)input,
+                                                               (float *)output);
+    } else if (_input_type == library_data_t::complex_float &&
+               _output_type == library_data_t::real_float) {
+      _direction = direction;
+      compute_real<float, oneapi::mkl::dft::precision::SINGLE>((float *)input,
+                                                               (float *)output);
+    } else if (_input_type == library_data_t::real_double &&
+               _output_type == library_data_t::complex_double) {
+      _direction = direction;
+      compute_real<double, oneapi::mkl::dft::precision::DOUBLE>(
+          (double *)input, (double *)output);
+    } else if (_input_type == library_data_t::complex_double &&
+               _output_type == library_data_t::real_double) {
+      _direction = direction;
+      compute_real<double, oneapi::mkl::dft::precision::DOUBLE>(
+          (double *)input, (double *)output);
+    }
+  }
+  template <>
+  void compute(float *input, sycl::float2 *output, fft_direction direction) {
+    _direction = direction;
+    compute_real<float, oneapi::mkl::dft::precision::SINGLE>((float *)input,
+                                                             (float *)output);
+  }
+  template <>
+  void compute(sycl::float2 *input, float *output, fft_direction direction) {
+    _direction = direction;
+    compute_real<float, oneapi::mkl::dft::precision::SINGLE>((float *)input,
+                                                             (float *)output);
+  }
+  template <>
+  void compute(double *input, sycl::double2 *output, fft_direction direction) {
+    _direction = direction;
+    compute_real<double, oneapi::mkl::dft::precision::DOUBLE>((double *)input,
+                                                              (double *)output);
+  }
+  template <>
+  void compute(sycl::double2 *input, double *output, fft_direction direction) {
+    _direction = direction;
+    compute_real<double, oneapi::mkl::dft::precision::DOUBLE>((double *)input,
+                                                              (double *)output);
+  }
+  template <>
+  void compute(sycl::float2 *input, sycl::float2 *output,
+               fft_direction direction) {
+    compute_complex<float, oneapi::mkl::dft::precision::SINGLE>(
+        (float *)input, (float *)output, direction);
+  }
+  template <>
+  void compute(sycl::double2 *input, sycl::double2 *output,
+               fft_direction direction) {
+    compute_complex<double, oneapi::mkl::dft::precision::DOUBLE>(
+        (double *)input, (double *)output, direction);
+  }
+  /// Setting the user's SYCL queue for calculation.
+  /// \param [in] q Pointer to the SYCL queue.
+  void set_queue(sycl::queue *q) { _q = q; }
+#ifdef __INTEL_MKL__
+  /// Setting whether to use external or internal workspace.
+  /// \param [in] flag True means using internal workspace. False means using
+  /// external workspace.
+  void use_internal_workspace(bool flag = true) {
+    _use_external_workspace = !flag;
+  }
+  /// Specify the external workspace.
+  /// \param [in] ptr Pointer to the workspace.
+  void set_workspace(void *ptr) {
+    if (!_use_external_workspace) {
+      return;
+    }
+    if (_input_type == library_data_t::complex_float &&
+        _output_type == library_data_t::complex_float) {
+      if (_q->get_device().is_gpu()) {
+        auto data = dpct::detail::get_memory<float>(ptr);
+        _desc_sc->set_workspace(data);
+      }
+    } else if (_input_type == library_data_t::complex_double &&
+               _output_type == library_data_t::complex_double) {
+      if (_q->get_device().is_gpu()) {
+        auto data = dpct::detail::get_memory<double>(ptr);
+        _desc_dc->set_workspace(data);
+      }
+    } else if ((_input_type == library_data_t::real_float &&
+                _output_type == library_data_t::complex_float) ||
+               (_input_type == library_data_t::complex_float &&
+                _output_type == library_data_t::real_float)) {
+      if (_q->get_device().is_gpu()) {
+        auto data = dpct::detail::get_memory<float>(ptr);
+        _desc_sr->set_workspace(data);
+      }
+    } else if ((_input_type == library_data_t::real_double &&
+                _output_type == library_data_t::complex_double) ||
+               (_input_type == library_data_t::complex_double &&
+                _output_type == library_data_t::real_double)) {
+      if (_q->get_device().is_gpu()) {
+        auto data = dpct::detail::get_memory<double>(ptr);
+        _desc_dr->set_workspace(data);
+      }
+    } else {
+      throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                            "invalid fft type");
+    }
+  }
+#endif
+  /// Get the workspace size.
+  /// \param [out] scratchpad_size Workspace size in bytes.
+  void get_workspace_size(size_t *scratchpad_size) {
+    if (scratchpad_size) {
+      *scratchpad_size = _workspace_bytes;
+    }
+  }
+
+private:
+  static std::pair<library_data_t, library_data_t>
+  fft_type_to_data_type(fft_type type) {
+    switch (type) {
+    case fft_type::real_float_to_complex_float: {
+      return std::make_pair(library_data_t::real_float,
+                            library_data_t::complex_float);
+    }
+    case fft_type::complex_float_to_real_float: {
+      return std::make_pair(library_data_t::complex_float,
+                            library_data_t::real_float);
+    }
+    case fft_type::real_double_to_complex_double: {
+      return std::make_pair(library_data_t::real_double,
+                            library_data_t::complex_double);
+    }
+    case fft_type::complex_double_to_real_double: {
+      return std::make_pair(library_data_t::complex_double,
+                            library_data_t::real_double);
+    }
+    case fft_type::complex_float_to_complex_float: {
+      return std::make_pair(library_data_t::complex_float,
+                            library_data_t::complex_float);
+    }
+    case fft_type::complex_double_to_complex_double: {
+      return std::make_pair(library_data_t::complex_double,
+                            library_data_t::complex_double);
+    }
+    }
+  }
+
+  void config_and_commit_basic() {
+    if (_input_type == library_data_t::complex_float &&
+        _output_type == library_data_t::complex_float) {
+      _desc_sc = std::make_shared<
+          oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE,
+                                       oneapi::mkl::dft::domain::COMPLEX>>(_n);
+      std::int64_t distance = 1;
+      for (auto i : _n)
+        distance = distance * i;
+      _fwd_dist = distance;
+      _bwd_dist = distance;
+      _desc_sc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,
+                          distance);
+      _desc_sc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,
+                          distance);
+      _desc_sc->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
+                          _batch);
+#ifdef __INTEL_MKL__
+      if (_is_user_specified_dir_and_placement && _is_inplace)
+        _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            DFTI_CONFIG_VALUE::DFTI_INPLACE);
+      else
+        _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
+      if (_use_external_workspace) {
+        if (_q->get_device().is_gpu()) {
+          _desc_sc->set_value(
+              oneapi::mkl::dft::config_param::WORKSPACE,
+              oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+        }
+      }
+      if (_is_estimate_call) {
+        if (_q->get_device().is_gpu()) {
+          _desc_sc->get_value(
+              oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES,
+              &_workspace_estimate_bytes);
+        }
+      } else {
+        _desc_sc->commit(*_q);
+        if (_q->get_device().is_gpu()) {
+          _desc_sc->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
+                              &_workspace_bytes);
+        }
+      }
+#else
+      if (_is_user_specified_dir_and_placement && _is_inplace)
+        _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            oneapi::mkl::dft::config_value::INPLACE);
+      else
+        _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            oneapi::mkl::dft::config_value::NOT_INPLACE);
+      _desc_sc->commit(*_q);
+#endif
+    } else if (_input_type == library_data_t::complex_double &&
+               _output_type == library_data_t::complex_double) {
+      _desc_dc = std::make_shared<
+          oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::DOUBLE,
+                                       oneapi::mkl::dft::domain::COMPLEX>>(_n);
+      std::int64_t distance = 1;
+      for (auto i : _n)
+        distance = distance * i;
+      _fwd_dist = distance;
+      _bwd_dist = distance;
+      _desc_dc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,
+                          distance);
+      _desc_dc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,
+                          distance);
+      _desc_dc->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
+                          _batch);
+#ifdef __INTEL_MKL__
+      if (_is_user_specified_dir_and_placement && _is_inplace)
+        _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            DFTI_CONFIG_VALUE::DFTI_INPLACE);
+      else
+        _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
+      if (_use_external_workspace) {
+        if (_q->get_device().is_gpu()) {
+          _desc_dc->set_value(
+              oneapi::mkl::dft::config_param::WORKSPACE,
+              oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+        }
+      }
+      if (_is_estimate_call) {
+        if (_q->get_device().is_gpu()) {
+          _desc_dc->get_value(
+              oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES,
+              &_workspace_estimate_bytes);
+        }
+      } else {
+        _desc_dc->commit(*_q);
+        if (_q->get_device().is_gpu()) {
+          _desc_dc->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
+                              &_workspace_bytes);
+        }
+      }
+#else
+      if (_is_user_specified_dir_and_placement && _is_inplace)
+        _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            oneapi::mkl::dft::config_value::INPLACE);
+      else
+        _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            oneapi::mkl::dft::config_value::NOT_INPLACE);
+      _desc_dc->commit(*_q);
+#endif
+    } else if ((_input_type == library_data_t::real_float &&
+                _output_type == library_data_t::complex_float) ||
+               (_input_type == library_data_t::complex_float &&
+                _output_type == library_data_t::real_float)) {
+      _desc_sr = std::make_shared<oneapi::mkl::dft::descriptor<
+          oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL>>(
+          _n);
+      if (_input_type == library_data_t::real_float &&
+          _output_type == library_data_t::complex_float)
+        _direction = fft_direction::forward;
+      else
+        _direction = fft_direction::backward;
+      _desc_sr->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
+                          _batch);
+#ifdef __INTEL_MKL__
+      if (_is_user_specified_dir_and_placement && _is_inplace) {
+        _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            DFTI_CONFIG_VALUE::DFTI_INPLACE);
+        set_stride_and_distance_basic<true>(_desc_sr);
+      } else {
+        _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
+        set_stride_and_distance_basic<false>(_desc_sr);
+      }
+      if (_use_external_workspace) {
+        if (_q->get_device().is_gpu()) {
+          _desc_sr->set_value(
+              oneapi::mkl::dft::config_param::WORKSPACE,
+              oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+        }
+      }
+      if (_is_estimate_call) {
+        if (_q->get_device().is_gpu()) {
+          _desc_sr->get_value(
+              oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES,
+              &_workspace_estimate_bytes);
+        }
+      } else {
+        _desc_sr->commit(*_q);
+        if (_q->get_device().is_gpu()) {
+          _desc_sr->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
+                              &_workspace_bytes);
+        }
+      }
+#else
+      if (_is_user_specified_dir_and_placement && _is_inplace) {
+        _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            oneapi::mkl::dft::config_value::INPLACE);
+        set_stride_and_distance_basic<true>(_desc_sr);
+      } else {
+        _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            oneapi::mkl::dft::config_value::NOT_INPLACE);
+        set_stride_and_distance_basic<false>(_desc_sr);
+      }
+      _desc_sr->commit(*_q);
+#endif
+    } else if ((_input_type == library_data_t::real_double &&
+                _output_type == library_data_t::complex_double) ||
+               (_input_type == library_data_t::complex_double &&
+                _output_type == library_data_t::real_double)) {
+      _desc_dr = std::make_shared<oneapi::mkl::dft::descriptor<
+          oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::REAL>>(
+          _n);
+      if (_input_type == library_data_t::real_double &&
+          _output_type == library_data_t::complex_double)
+        _direction = fft_direction::forward;
+      else
+        _direction = fft_direction::backward;
+      _desc_dr->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
+                          _batch);
+#ifdef __INTEL_MKL__
+      if (_is_user_specified_dir_and_placement && _is_inplace) {
+        _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            DFTI_CONFIG_VALUE::DFTI_INPLACE);
+        set_stride_and_distance_basic<true>(_desc_dr);
+      } else {
+        _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
+        set_stride_and_distance_basic<false>(_desc_dr);
+      }
+      if (_use_external_workspace) {
+        if (_q->get_device().is_gpu()) {
+          _desc_dr->set_value(
+              oneapi::mkl::dft::config_param::WORKSPACE,
+              oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
+        }
+      }
+      if (_is_estimate_call) {
+        if (_q->get_device().is_gpu()) {
+          _desc_dr->get_value(
+              oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES,
+              &_workspace_estimate_bytes);
+        }
+      } else {
+        _desc_dr->commit(*_q);
+        if (_q->get_device().is_gpu()) {
+          _desc_dr->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
+                              &_workspace_bytes);
+        }
+      }
+#else
+      if (_is_user_specified_dir_and_placement && _is_inplace) {
+        _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            oneapi::mkl::dft::config_value::INPLACE);
+        set_stride_and_distance_basic<true>(_desc_dr);
+      } else {
+        _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                            oneapi::mkl::dft::config_value::NOT_INPLACE);
+        set_stride_and_distance_basic<false>(_desc_dr);
+      }
+      _desc_dr->commit(*_q);
+#endif
+    } else {
+      throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                            "invalid fft type");
+    }
+  }
+
+  void config_and_commit_advanced() {
+#ifdef __INTEL_MKL__
+#define CONFIG_AND_COMMIT(DESC, PREC, DOM, TYPE)                               \
+  {                                                                            \
+    DESC = std::make_shared<oneapi::mkl::dft::descriptor<                      \
+        oneapi::mkl::dft::precision::PREC, oneapi::mkl::dft::domain::DOM>>(    \
+        _n);                                                                   \
+    set_stride_advanced(DESC);                                                 \
+    DESC->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, _fwd_dist);  \
+    DESC->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, _bwd_dist);  \
+    DESC->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,      \
+                    _batch);                                                   \
+    if (_is_user_specified_dir_and_placement && _is_inplace)                   \
+      DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT,               \
+                      DFTI_CONFIG_VALUE::DFTI_INPLACE);                        \
+    else                                                                       \
+      DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT,               \
+                      DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);                    \
+    if (_use_external_workspace) {                                             \
+      DESC->set_value(oneapi::mkl::dft::config_param::WORKSPACE,               \
+                      oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);     \
+    }                                                                          \
+    if (_is_estimate_call) {                                                   \
+      if (_q->get_device().is_gpu()) {                                         \
+        DESC->get_value(                                                       \
+            oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES,          \
+            &_workspace_estimate_bytes);                                       \
+      }                                                                        \
+    } else {                                                                   \
+      DESC->commit(*_q);                                                       \
+      if (_is_estimate_call) {                                                 \
+        DESC->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,       \
+                        &_workspace_bytes);                                    \
+      }                                                                        \
+    }                                                                          \
+  }
+#else
+#define CONFIG_AND_COMMIT(DESC, PREC, DOM, TYPE)                               \
+  {                                                                            \
+    DESC = std::make_shared<oneapi::mkl::dft::descriptor<                      \
+        oneapi::mkl::dft::precision::PREC, oneapi::mkl::dft::domain::DOM>>(    \
+        _n);                                                                   \
+    set_stride_advanced(DESC);                                                 \
+    DESC->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, _fwd_dist);  \
+    DESC->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, _bwd_dist);  \
+    DESC->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,      \
+                    _batch);                                                   \
+    if (_is_user_specified_dir_and_placement && _is_inplace)                   \
+      DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT,               \
+                      oneapi::mkl::dft::config_value::INPLACE);                \
+    else                                                                       \
+      DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT,               \
+                      oneapi::mkl::dft::config_value::NOT_INPLACE);            \
+    DESC->commit(*_q);                                                         \
+  }
+#endif
+
+    if (_input_type == library_data_t::complex_float &&
+        _output_type == library_data_t::complex_float) {
+      CONFIG_AND_COMMIT(_desc_sc, SINGLE, COMPLEX, float);
+    } else if (_input_type == library_data_t::complex_double &&
+               _output_type == library_data_t::complex_double) {
+      CONFIG_AND_COMMIT(_desc_dc, DOUBLE, COMPLEX, double);
+    } else if ((_input_type == library_data_t::real_float &&
+                _output_type == library_data_t::complex_float) ||
+               (_input_type == library_data_t::complex_float &&
+                _output_type == library_data_t::real_float)) {
+      CONFIG_AND_COMMIT(_desc_sr, SINGLE, REAL, float);
+    } else if ((_input_type == library_data_t::real_double &&
+                _output_type == library_data_t::complex_double) ||
+               (_input_type == library_data_t::complex_double &&
+                _output_type == library_data_t::real_double)) {
+      CONFIG_AND_COMMIT(_desc_dr, DOUBLE, REAL, double);
+    } else {
+      throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                            "invalid fft type");
+    }
+#undef CONFIG_AND_COMMIT
+  }
+
+  template <typename T>
+  void init(int dim, T *n, T *inembed, T istride, T idist,
+            library_data_t input_type, T *onembed, T ostride, T odist,
+            library_data_t output_type, T batch,
+            std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
+                direction_and_placement) {
+    if (direction_and_placement.has_value()) {
+      _is_user_specified_dir_and_placement = true;
+      _direction = direction_and_placement->first;
+      _is_inplace = direction_and_placement->second;
+    }
+    _n.resize(dim);
+    _inembed.resize(dim);
+    _onembed.resize(dim);
+    _input_type = input_type;
+    _output_type = output_type;
+    for (int i = 0; i < dim; i++) {
+      _n[i] = n[i];
+    }
+    if (inembed && onembed) {
+      for (int i = 0; i < dim; i++) {
+        _inembed[i] = inembed[i];
+        _onembed[i] = onembed[i];
+      }
+      _istride = istride;
+      _ostride = ostride;
+
+      if ((_input_type == library_data_t::real_float &&
+           _output_type == library_data_t::complex_float) ||
+          (_input_type == library_data_t::real_double &&
+           _output_type == library_data_t::complex_double)) {
+        _fwd_dist = idist;
+        _bwd_dist = odist;
+      } else if ((_output_type == library_data_t::real_float &&
+                  _input_type == library_data_t::complex_float) ||
+                 (_output_type == library_data_t::real_double &&
+                  _input_type == library_data_t::complex_double)) {
+        _fwd_dist = odist;
+        _bwd_dist = idist;
+      } else {
+        if (_is_user_specified_dir_and_placement &&
+            (_direction == fft_direction::backward)) {
+          _fwd_dist = odist;
+          _bwd_dist = idist;
+        } else {
+          _fwd_dist = idist;
+          _bwd_dist = odist;
+        }
+      }
+    } else {
+      _is_basic = true;
+    }
+    _batch = batch;
+    _dim = dim;
+
+    if (_is_basic)
+      config_and_commit_basic();
+    else
+      config_and_commit_advanced();
+  }
+  template <class Desc_t>
+  void set_stride_advanced(std::shared_ptr<Desc_t> desc) {
+    if (_dim == 1) {
+      std::int64_t input_stride[2] = {0, _istride};
+      std::int64_t output_stride[2] = {0, _ostride};
+      desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,
+                      input_stride);
+      desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,
+                      output_stride);
+    } else if (_dim == 2) {
+      std::int64_t input_stride[3] = {0, _inembed[1] * _istride, _istride};
+      std::int64_t output_stride[3] = {0, _onembed[1] * _ostride, _ostride};
+      desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,
+                      input_stride);
+      desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,
+                      output_stride);
+    } else if (_dim == 3) {
+      std::int64_t input_stride[4] = {0, _inembed[2] * _inembed[1] * _istride,
+                                      _inembed[2] * _istride, _istride};
+      std::int64_t output_stride[4] = {0, _onembed[2] * _onembed[1] * _ostride,
+                                       _onembed[2] * _ostride, _ostride};
+      desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,
+                      input_stride);
+      desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,
+                      output_stride);
+    }
+  }
+
+  template <class Desc_t> void swap_distance(std::shared_ptr<Desc_t> desc) {
+    desc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, _bwd_dist);
+    desc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, _fwd_dist);
+    std::int64_t temp = _bwd_dist;
+    _bwd_dist = _fwd_dist;
+    _fwd_dist = temp;
+  }
+
+  template <bool Is_inplace, class Desc_t>
+  void set_stride_and_distance_basic(std::shared_ptr<Desc_t> desc) {
+    std::int64_t forward_distance = 0;
+    std::int64_t backward_distance = 0;
+
+#define SET_STRIDE                                                             \
+  {                                                                            \
+    if (_direction == fft_direction::forward) {                                \
+      desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,           \
+                      real_stride);                                            \
+      desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,          \
+                      complex_stride);                                         \
+    } else {                                                                   \
+      desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,           \
+                      complex_stride);                                         \
+      desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,          \
+                      real_stride);                                            \
+    }                                                                          \
+  }
+    if (_dim == 1) {
+      if constexpr (Is_inplace) {
+        std::int64_t real_stride[2] = {0, 1};
+        std::int64_t complex_stride[2] = {0, 1};
+        SET_STRIDE;
+        forward_distance = 2 * (_n[0] / 2 + 1);
+        backward_distance = _n[0] / 2 + 1;
+      } else {
+        std::int64_t real_stride[2] = {0, 1};
+        std::int64_t complex_stride[2] = {0, 1};
+        SET_STRIDE;
+        forward_distance = _n[0];
+        backward_distance = _n[0] / 2 + 1;
+      }
+    } else if (_dim == 2) {
+      if constexpr (Is_inplace) {
+        std::int64_t complex_stride[3] = {0, _n[1] / 2 + 1, 1};
+        std::int64_t real_stride[3] = {0, 2 * (_n[1] / 2 + 1), 1};
+        SET_STRIDE;
+        forward_distance = _n[0] * 2 * (_n[1] / 2 + 1);
+        backward_distance = _n[0] * (_n[1] / 2 + 1);
+      } else {
+        std::int64_t complex_stride[3] = {0, _n[1] / 2 + 1, 1};
+        std::int64_t real_stride[3] = {0, _n[1], 1};
+        SET_STRIDE;
+        forward_distance = _n[0] * _n[1];
+        backward_distance = _n[0] * (_n[1] / 2 + 1);
+      }
+    } else if (_dim == 3) {
+      if constexpr (Is_inplace) {
+        std::int64_t complex_stride[4] = {0, _n[1] * (_n[2] / 2 + 1),
+                                          _n[2] / 2 + 1, 1};
+        std::int64_t real_stride[4] = {0, _n[1] * 2 * (_n[2] / 2 + 1),
+                                       2 * (_n[2] / 2 + 1), 1};
+        SET_STRIDE;
+        forward_distance = _n[0] * _n[1] * 2 * (_n[2] / 2 + 1);
+        backward_distance = _n[0] * _n[1] * (_n[2] / 2 + 1);
+      } else {
+        std::int64_t complex_stride[4] = {0, _n[1] * (_n[2] / 2 + 1),
+                                          _n[2] / 2 + 1, 1};
+        std::int64_t real_stride[4] = {0, _n[1] * _n[2], _n[2], 1};
+        SET_STRIDE;
+        forward_distance = _n[0] * _n[1] * _n[2];
+        backward_distance = _n[0] * _n[1] * (_n[2] / 2 + 1);
+      }
+    }
+#undef SET_STRIDE
+    desc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,
+                    forward_distance);
+    desc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,
+                    backward_distance);
+  }
+
+#define COMPUTE(DESC)                                                          \
+  {                                                                            \
+    if (_is_inplace) {                                                         \
+      auto data_input = dpct::detail::get_memory<T>(input);                    \
+      if (_direction == fft_direction::forward) {                              \
+        oneapi::mkl::dft::compute_forward<                                     \
+            std::remove_reference_t<decltype(*DESC)>, T>(*DESC, data_input);   \
+      } else {                                                                 \
+        oneapi::mkl::dft::compute_backward<                                    \
+            std::remove_reference_t<decltype(*DESC)>, T>(*DESC, data_input);   \
+      }                                                                        \
+    } else {                                                                   \
+      auto data_input = dpct::detail::get_memory<T>(input);                    \
+      auto data_output = dpct::detail::get_memory<T>(output);                  \
+      if (_direction == fft_direction::forward) {                              \
+        oneapi::mkl::dft::compute_forward<                                     \
+            std::remove_reference_t<decltype(*DESC)>, T, T>(*DESC, data_input, \
+                                                            data_output);      \
+      } else {                                                                 \
+        oneapi::mkl::dft::compute_backward<                                    \
+            std::remove_reference_t<decltype(*DESC)>, T, T>(*DESC, data_input, \
+                                                            data_output);      \
+      }                                                                        \
+    }                                                                          \
+  }
+
+  template <class T, oneapi::mkl::dft::precision Precision>
+  void compute_complex(T *input, T *output, fft_direction direction) {
+    bool is_this_compute_inplace = input == output;
+
+    if (!_is_user_specified_dir_and_placement) {
+      // The complex domain descriptor need different config values if the
+      // FFT direction or placement is different.
+      // Here we check the conditions, and new config values are set and
+      // re-committed if needed.
+      if (direction != _direction || is_this_compute_inplace != _is_inplace) {
+        if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) {
+          if (direction != _direction) {
+            swap_distance(_desc_sc);
+            _direction = direction;
+          }
+          if (is_this_compute_inplace != _is_inplace) {
+            _is_inplace = is_this_compute_inplace;
+#ifdef __INTEL_MKL__
+            if (_is_inplace) {
+              _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                  DFTI_CONFIG_VALUE::DFTI_INPLACE);
+            } else {
+              _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                  DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
+            }
+#else
+            if (_is_inplace) {
+              _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                  oneapi::mkl::dft::config_value::INPLACE);
+            } else {
+              _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                  oneapi::mkl::dft::config_value::NOT_INPLACE);
+            }
+#endif
+          }
+          _desc_sc->commit(*_q);
+        } else {
+          if (direction != _direction) {
+            swap_distance(_desc_dc);
+            _direction = direction;
+          }
+          if (is_this_compute_inplace != _is_inplace) {
+            _is_inplace = is_this_compute_inplace;
+#ifdef __INTEL_MKL__
+            if (_is_inplace) {
+              _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                  DFTI_CONFIG_VALUE::DFTI_INPLACE);
+            } else {
+              _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                  DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
+            }
+#else
+            if (_is_inplace) {
+              _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                  oneapi::mkl::dft::config_value::INPLACE);
+            } else {
+              _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                  oneapi::mkl::dft::config_value::NOT_INPLACE);
+            }
+#endif
+          }
+          _desc_dc->commit(*_q);
+        }
+      }
+    }
+
+    if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) {
+      COMPUTE(_desc_sc);
+    } else {
+      COMPUTE(_desc_dc);
+    }
+  }
+
+  template <class T, oneapi::mkl::dft::precision Precision>
+  void compute_real(T *input, T *output) {
+    bool is_this_compute_inplace = input == output;
+
+    if (!_is_user_specified_dir_and_placement) {
+      // The real domain descriptor need different config values if the
+      // FFT placement is different.
+      // Here we check the condition, and new config values are set and
+      // re-committed if needed.
+      if (is_this_compute_inplace != _is_inplace) {
+        if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) {
+          _is_inplace = is_this_compute_inplace;
+          if (_is_inplace) {
+#ifdef __INTEL_MKL__
+            _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                DFTI_CONFIG_VALUE::DFTI_INPLACE);
+#else
+            _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                oneapi::mkl::dft::config_value::INPLACE);
+#endif
+            if (_is_basic)
+              set_stride_and_distance_basic<true>(_desc_sr);
+          } else {
+#ifdef __INTEL_MKL__
+            _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
+#else
+            _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                oneapi::mkl::dft::config_value::NOT_INPLACE);
+#endif
+            if (_is_basic)
+              set_stride_and_distance_basic<false>(_desc_sr);
+          }
+          _desc_sr->commit(*_q);
+        } else {
+          _is_inplace = is_this_compute_inplace;
+          if (_is_inplace) {
+#ifdef __INTEL_MKL__
+            _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                DFTI_CONFIG_VALUE::DFTI_INPLACE);
+#else
+            _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                oneapi::mkl::dft::config_value::INPLACE);
+#endif
+            if (_is_basic)
+              set_stride_and_distance_basic<true>(_desc_dr);
+          } else {
+#ifdef __INTEL_MKL__
+            _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
+#else
+            _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
+                                oneapi::mkl::dft::config_value::NOT_INPLACE);
+#endif
+            if (_is_basic)
+              set_stride_and_distance_basic<false>(_desc_dr);
+          }
+          _desc_dr->commit(*_q);
+        }
+      }
+    }
+
+    if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) {
+      COMPUTE(_desc_sr);
+    } else {
+      COMPUTE(_desc_dr);
+    }
+  }
+#undef COMPUTE
+
+private:
+  sycl::queue *_q = nullptr;
+  int _dim;
+  std::vector<std::int64_t> _n;
+  std::vector<std::int64_t> _inembed;
+  std::int64_t _istride;
+  std::int64_t _fwd_dist;
+  library_data_t _input_type;
+  std::vector<std::int64_t> _onembed;
+  std::int64_t _ostride;
+  std::int64_t _bwd_dist;
+  library_data_t _output_type;
+  std::int64_t _batch = 1;
+  bool _is_basic = false;
+  bool _is_inplace = false;
+  fft_direction _direction = fft_direction::forward;
+  bool _is_user_specified_dir_and_placement = false;
+  bool _use_external_workspace = false;
+  void *_external_workspace_ptr = nullptr;
+  size_t _workspace_bytes = 0;
+  bool _is_estimate_call = false;
+  size_t _workspace_estimate_bytes = 0;
+  std::shared_ptr<oneapi::mkl::dft::descriptor<
+      oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL>>
+      _desc_sr;
+  std::shared_ptr<oneapi::mkl::dft::descriptor<
+      oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::REAL>>
+      _desc_dr;
+  std::shared_ptr<oneapi::mkl::dft::descriptor<
+      oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::COMPLEX>>
+      _desc_sc;
+  std::shared_ptr<oneapi::mkl::dft::descriptor<
+      oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::COMPLEX>>
+      _desc_dc;
+};
+
+using fft_engine_ptr = fft_engine *;
+} // namespace fft
+} // namespace dpct
+
+#endif // __DPCT_FFT_UTILS_HPP__
diff --git a/dpct/image.hpp b/dpct/image.hpp
new file mode 100644
index 0000000000000..b9bb246685e7b
--- /dev/null
+++ b/dpct/image.hpp
@@ -0,0 +1,901 @@
+//==---- image.hpp --------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_IMAGE_HPP__
+#define __DPCT_IMAGE_HPP__
+
+#include <sycl/sycl.hpp>
+
+#include "memory.hpp"
+#include "util.hpp"
+
+namespace dpct {
+
+enum class image_channel_data_type {
+  signed_int,
+  unsigned_int,
+  fp,
+};
+
+class image_channel;
+class image_wrapper_base;
+namespace detail {
+/// Image object type traits, with accessor type and sampled data type defined.
+/// The data type of an image accessor must be one of sycl::int4, sycl::uint4,
+/// sycl::float4 and sycl::half4. The data type of accessors with 8bits/16bits
+/// channel width will be 32 bits. sycl::half is an exception.
+template <class T> struct image_trait {
+  using acc_data_t = sycl::vec<T, 4>;
+  template <int dimensions>
+  using accessor_t =
+      sycl::accessor<acc_data_t, dimensions, sycl::access_mode::read,
+                         sycl::access::target::image>;
+  template <int dimensions>
+  using array_accessor_t =
+      sycl::accessor<acc_data_t, dimensions, sycl::access_mode::read,
+                         sycl::access::target::image_array>;
+  using data_t = T;
+  using elem_t = T;
+  static constexpr image_channel_data_type data_type =
+      std::is_integral<T>::value
+          ? (std::is_signed<T>::value ? image_channel_data_type::signed_int
+                                      : image_channel_data_type::unsigned_int)
+          : image_channel_data_type::fp;
+  static constexpr int channel_num = 1;
+};
+template <>
+struct image_trait<std::uint8_t> : public image_trait<std::uint32_t> {
+  using data_t = std::uint8_t;
+  using elem_t = data_t;
+};
+template <>
+struct image_trait<std::uint16_t>
+    : public image_trait<std::uint32_t> {
+  using data_t = std::uint16_t;
+  using elem_t = data_t;
+};
+template <>
+struct image_trait<std::int8_t> : public image_trait<std::int32_t> {
+  using data_t = std::int8_t;
+  using elem_t = data_t;
+};
+template <>
+struct image_trait<std::int16_t> : public image_trait<std::int32_t> {
+  using data_t = std::int16_t;
+  using elem_t = data_t;
+};
+template <>
+struct image_trait<char>
+    : public image_trait<typename std::conditional<
+          std::is_signed<char>::value, signed char, unsigned char>::type> {};
+
+template <class T>
+struct image_trait<sycl::vec<T, 1>> : public image_trait<T> {};
+
+template <class T>
+struct image_trait<sycl::vec<T, 2>> : public image_trait<T> {
+  using data_t = sycl::vec<T, 2>;
+  static constexpr int channel_num = 2;
+};
+
+template <class T>
+struct image_trait<sycl::vec<T, 3>>
+    : public image_trait<sycl::vec<T, 4>> {
+  static constexpr int channel_num = 3;
+};
+
+template <class T>
+struct image_trait<sycl::vec<T, 4>> : public image_trait<T> {
+  using data_t = sycl::vec<T, 4>;
+  static constexpr int channel_num = 4;
+};
+
+/// Functor to fetch data from read result of an image accessor.
+template <class T> struct fetch_data {
+  using return_t = typename image_trait<T>::data_t;
+  using acc_data_t = typename image_trait<T>::acc_data_t;
+
+  return_t operator()(acc_data_t &&original_data) {
+    return (return_t)original_data.r();
+  }
+};
+template <class T>
+struct fetch_data<sycl::vec<T, 1>> : public fetch_data<T> {};
+template <class T> struct fetch_data<sycl::vec<T, 2>> {
+  using return_t = typename image_trait<sycl::vec<T, 2>>::data_t;
+  using acc_data_t = typename image_trait<sycl::vec<T, 2>>::acc_data_t;
+
+  return_t operator()(acc_data_t &&origin_data) {
+    return return_t(origin_data.r(), origin_data.g());
+  }
+};
+template <class T>
+struct fetch_data<sycl::vec<T, 3>>
+    : public fetch_data<sycl::vec<T, 4>> {};
+template <class T> struct fetch_data<sycl::vec<T, 4>> {
+  using return_t = typename image_trait<sycl::vec<T, 4>>::data_t;
+  using acc_data_t = typename image_trait<sycl::vec<T, 4>>::acc_data_t;
+
+  return_t operator()(acc_data_t &&origin_data) {
+    return return_t(origin_data.r(), origin_data.g(), origin_data.b(),
+                    origin_data.a());
+  }
+};
+
+/// Create image according with given type \p T and \p dims.
+template <class T> static image_wrapper_base *create_image_wrapper(int dims);
+
+/// Create image with given data type \p T, channel order and dims
+template <class T>
+static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims);
+
+/// Create image with channel info and specified dimensions.
+static image_wrapper_base *create_image_wrapper(image_channel channel, int dims);
+
+} // namespace detail
+
+/// Image channel info, include channel number, order, data width and type
+class image_channel {
+  image_channel_data_type _type = image_channel_data_type::signed_int;
+  /// Number of channels.
+  unsigned _channel_num = 0;
+  /// Total size of all channels in bytes.
+  unsigned _total_size = 0;
+  /// Size of each channel in bytes.
+  unsigned _channel_size = 0;
+
+public:
+  /// Create image channel info according to template argument \p T.
+  template <class T> static image_channel create() {
+    image_channel channel;
+    channel.set_channel_size(detail::image_trait<T>::channel_num,
+                             sizeof(typename detail::image_trait<T>::elem_t) *
+                                 8);
+    channel.set_channel_data_type(detail::image_trait<T>::data_type);
+    return channel;
+  }
+
+  image_channel() = default;
+
+  image_channel_data_type get_channel_data_type() { return _type; }
+  void set_channel_data_type(image_channel_data_type type) { _type = type; }
+
+  unsigned get_total_size() { return _total_size; }
+
+  unsigned get_channel_num() { return _channel_num; }
+  void set_channel_num(unsigned channel_num) {
+    _channel_num = channel_num;
+    _total_size = _channel_size * _channel_num;
+  }
+
+  /// image_channel constructor.
+  /// \param r Channel r width in bits.
+  /// \param g Channel g width in bits. Should be same with \p r, or zero.
+  /// \param b Channel b width in bits. Should be same with \p g, or zero.
+  /// \param a Channel a width in bits. Should be same with \p b, or zero.
+  /// \param data_type Image channel data type: signed_nt, unsigned_int or fp.
+  image_channel(int r, int g, int b, int a, image_channel_data_type data_type) {
+    _type = data_type;
+    if (a) {
+      assert(r == a && "SYCL doesn't support different channel size");
+      assert(r == b && "SYCL doesn't support different channel size");
+      assert(r == g && "SYCL doesn't support different channel size");
+      set_channel_size(4, a);
+    } else if (b) {
+      assert(r == b && "SYCL doesn't support different channel size");
+      assert(r == g && "SYCL doesn't support different channel size");
+      set_channel_size(3, b);
+    } else if (g) {
+      assert(r == g && "SYCL doesn't support different channel size");
+      set_channel_size(2, g);
+    } else {
+      set_channel_size(1, r);
+    }
+  }
+
+  sycl::image_channel_type get_channel_type() const {
+    if (_channel_size == 4) {
+      if (_type == image_channel_data_type::signed_int)
+        return sycl::image_channel_type::signed_int32;
+      else if (_type == image_channel_data_type::unsigned_int)
+        return sycl::image_channel_type::unsigned_int32;
+      else if (_type == image_channel_data_type::fp)
+        return sycl::image_channel_type::fp32;
+    } else if (_channel_size == 2) {
+      if (_type == image_channel_data_type::signed_int)
+        return sycl::image_channel_type::signed_int16;
+      else if (_type == image_channel_data_type::unsigned_int)
+        return sycl::image_channel_type::unsigned_int16;
+      else if (_type == image_channel_data_type::fp)
+        return sycl::image_channel_type::fp16;
+    } else {
+      if (_type == image_channel_data_type::signed_int)
+        return sycl::image_channel_type::signed_int8;
+      else if (_type == image_channel_data_type::unsigned_int)
+        return sycl::image_channel_type::unsigned_int8;
+    }
+    assert(false && "unexpected channel data kind and channel size");
+    return sycl::image_channel_type::signed_int32;
+  }
+  void set_channel_type(sycl::image_channel_type type) {
+    switch (type) {
+    case sycl::image_channel_type::unsigned_int8:
+      _type = image_channel_data_type::unsigned_int;
+      _channel_size = 1;
+      break;
+    case sycl::image_channel_type::unsigned_int16:
+      _type = image_channel_data_type::unsigned_int;
+      _channel_size = 2;
+      break;
+    case sycl::image_channel_type::unsigned_int32:
+      _type = image_channel_data_type::unsigned_int;
+      _channel_size = 4;
+      break;
+    case sycl::image_channel_type::signed_int8:
+      _type = image_channel_data_type::signed_int;
+      _channel_size = 1;
+      break;
+    case sycl::image_channel_type::signed_int16:
+      _type = image_channel_data_type::signed_int;
+      _channel_size = 2;
+      break;
+    case sycl::image_channel_type::signed_int32:
+      _type = image_channel_data_type::signed_int;
+      _channel_size = 4;
+      break;
+    case sycl::image_channel_type::fp16:
+      _type = image_channel_data_type::fp;
+      _channel_size = 2;
+      break;
+    case sycl::image_channel_type::fp32:
+      _type = image_channel_data_type::fp;
+      _channel_size = 4;
+      break;
+    default:
+      break;
+    }
+    _total_size = _channel_size * _channel_num;
+  }
+
+  sycl::image_channel_order get_channel_order() const {
+    switch (_channel_num) {
+    case 1:
+      return sycl::image_channel_order::r;
+    case 2:
+      return sycl::image_channel_order::rg;
+    case 3:
+      return sycl::image_channel_order::rgb;
+    case 4:
+      return sycl::image_channel_order::rgba;
+    default:
+      return sycl::image_channel_order::r;
+    }
+  }
+  /// Get the size for each channel in bits.
+  unsigned get_channel_size() const { return _channel_size * 8; }
+
+  /// Set channel size.
+  /// \param in_channel_num Channels number to set.
+  /// \param channel_size Size for each channel in bits.
+  void set_channel_size(unsigned in_channel_num,
+                        unsigned channel_size) {
+    if (in_channel_num < _channel_num)
+      return;
+    _channel_num = in_channel_num;
+    _channel_size = channel_size / 8;
+    _total_size = _channel_size * _channel_num;
+  }
+};
+
+/// 2D or 3D matrix data for image.
+class image_matrix {
+  image_channel _channel;
+  int _range[3] = {1, 1, 1};
+  int _dims = 0;
+  void *_host_data = nullptr;
+
+  /// Set range of each dimension.
+  template <int dimensions> void set_range(sycl::range<dimensions> range) {
+    for (int i = 0; i < dimensions; ++i)
+      _range[i] = range[i];
+    _dims = dimensions;
+  }
+
+  template <int... DimIdx>
+  sycl::range<sizeof...(DimIdx)> get_range(integer_sequence<DimIdx...>) {
+    return sycl::range<sizeof...(DimIdx)>(_range[DimIdx]...);
+  }
+
+public:
+  /// Constructor with channel info and dimension size info.
+  template <int dimensions>
+  image_matrix(image_channel channel, sycl::range<dimensions> range)
+      : _channel(channel) {
+    set_range(range);
+    _host_data = std::malloc(range.size() * _channel.get_total_size());
+  }
+  image_matrix(sycl::image_channel_type channel_type, unsigned channel_num,
+               size_t x, size_t y) {
+    _channel.set_channel_type(channel_type);
+    _channel.set_channel_num(channel_num);
+    _dims = 1;
+    _range[0] = x;
+    if (y) {
+      _dims = 2;
+      _range[1] = y;
+    }
+    _host_data = std::malloc(_range[0] * _range[1] * _channel.get_total_size());
+  }
+
+  /// Construct a new image class with the matrix data.
+  template <int dimensions> sycl::image<dimensions> *create_image() {
+    return create_image<dimensions>(_channel);
+  }
+  /// Construct a new image class with the matrix data.
+  template <int dimensions>
+  sycl::image<dimensions> *create_image(image_channel channel) {
+    return new sycl::image<dimensions>(
+        _host_data, channel.get_channel_order(), channel.get_channel_type(),
+        get_range(make_index_sequence<dimensions>()),
+        sycl::property::image::use_host_ptr());
+  }
+
+  /// Get channel info.
+  inline image_channel get_channel() { return _channel; }
+  /// Get range of the image.
+  sycl::range<3> get_range() {
+    return sycl::range<3>(_range[0], _range[1], _range[2]);
+  }
+  /// Get matrix dims.
+  inline int get_dims() { return _dims; }
+  /// Convert to pitched data.
+  pitched_data to_pitched_data() {
+    return pitched_data(_host_data, _range[0] * _channel.get_total_size(),
+                        _range[0], _range[1]);
+  }
+
+  ~image_matrix() {
+    if (_host_data)
+      std::free(_host_data);
+    _host_data = nullptr;
+  }
+};
+using image_matrix_p = image_matrix *;
+
+enum class image_data_type { matrix, linear, pitch, unsupport };
+
+/// Image data info.
+class image_data {
+public:
+  image_data() { _type = image_data_type::unsupport; }
+  image_data(image_matrix_p matrix_data) { set_data(matrix_data); }
+  image_data(void *data_ptr, size_t x_size, image_channel channel) {
+    set_data(data_ptr, x_size, channel);
+  }
+  image_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size,
+             image_channel channel) {
+    set_data(data_ptr, x_size, y_size, pitch_size, channel);
+  }
+  void set_data(image_matrix_p matrix_data) {
+    _type = image_data_type::matrix;
+    _data = matrix_data;
+    _channel = matrix_data->get_channel();
+  }
+  void set_data(void *data_ptr, size_t x_size, image_channel channel) {
+    _type = image_data_type::linear;
+    _data = data_ptr;
+    _x = x_size;
+    _channel = channel;
+  }
+  void set_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size,
+                image_channel channel) {
+    _type = image_data_type::pitch;
+    _data = data_ptr;
+    _x = x_size;
+    _y = y_size;
+    _pitch = pitch_size;
+    _channel = channel;
+  }
+
+  image_data_type get_data_type() const { return _type; }
+  void set_data_type(image_data_type type) { _type = type; }
+
+  void *get_data_ptr() const { return _data; }
+  void set_data_ptr(void *data) { _data = data; }
+
+  size_t get_x() const { return _x; }
+  void set_x(size_t x) { _x = x; }
+
+  size_t get_y() const { return _y; }
+  void set_y(size_t y) { _y = y; }
+
+  size_t get_pitch() const { return _pitch; }
+  void set_pitch(size_t pitch) { _pitch = pitch; }
+
+  image_channel get_channel() const { return _channel; }
+  void set_channel(image_channel channel) { _channel = channel; }
+
+  image_channel_data_type get_channel_data_type() {
+    return _channel.get_channel_data_type();
+  }
+  void set_channel_data_type(image_channel_data_type type) {
+    _channel.set_channel_data_type(type);
+  }
+
+  unsigned get_channel_size() { return _channel.get_channel_size(); }
+  void set_channel_size(unsigned channel_num, unsigned channel_size) {
+    return _channel.set_channel_size(channel_num, channel_size);
+  }
+
+  unsigned get_channel_num() { return _channel.get_channel_num(); }
+  void set_channel_num(unsigned num) {
+    return _channel.set_channel_num(num);
+  }
+
+  sycl::image_channel_type get_channel_type() {
+    return _channel.get_channel_type();
+  }
+  void set_channel_type(sycl::image_channel_type type) {
+    return _channel.set_channel_type(type);
+  }
+
+private:
+  image_data_type _type;
+  void *_data = nullptr;
+  size_t _x, _y, _pitch;
+  image_channel _channel;
+};
+
+/// Image sampling info, include addressing mode, filtering mode and
+/// normalization info.
+class sampling_info {
+  sycl::addressing_mode _addressing_mode =
+      sycl::addressing_mode::clamp_to_edge;
+  sycl::filtering_mode _filtering_mode = sycl::filtering_mode::nearest;
+  sycl::coordinate_normalization_mode _coordinate_normalization_mode =
+      sycl::coordinate_normalization_mode::unnormalized;
+
+public:
+  sycl::addressing_mode get_addressing_mode() { return _addressing_mode; }
+  void set(sycl::addressing_mode addressing_mode) { _addressing_mode = addressing_mode; }
+
+  sycl::filtering_mode get_filtering_mode() { return _filtering_mode; }
+  void set(sycl::filtering_mode filtering_mode) { _filtering_mode = filtering_mode; }
+
+  sycl::coordinate_normalization_mode get_coordinate_normalization_mode() {
+    return _coordinate_normalization_mode;
+  }
+  void set(sycl::coordinate_normalization_mode coordinate_normalization_mode) {
+    _coordinate_normalization_mode = coordinate_normalization_mode;
+  }
+
+  bool is_coordinate_normalized() {
+    return _coordinate_normalization_mode ==
+           sycl::coordinate_normalization_mode::normalized;
+  }
+  void set_coordinate_normalization_mode(int is_normalized) {
+    _coordinate_normalization_mode =
+        is_normalized ? sycl::coordinate_normalization_mode::normalized
+                      : sycl::coordinate_normalization_mode::unnormalized;
+  }
+  void
+  set(sycl::addressing_mode addressing_mode,
+      sycl::filtering_mode filtering_mode,
+      sycl::coordinate_normalization_mode coordinate_normalization_mode) {
+    set(addressing_mode);
+    set(filtering_mode);
+    set(coordinate_normalization_mode);
+  }
+  void set(sycl::addressing_mode addressing_mode,
+           sycl::filtering_mode filtering_mode, int is_normalized) {
+    set(addressing_mode);
+    set(filtering_mode);
+    set_coordinate_normalization_mode(is_normalized);
+  }
+
+  sycl::sampler get_sampler() {
+    return sycl::sampler(_coordinate_normalization_mode, _addressing_mode,
+                             _filtering_mode);
+  }
+};
+
+/// Image base class.
+class image_wrapper_base {
+  sampling_info _sampling_info;
+  image_data _data;
+
+public:
+  virtual ~image_wrapper_base() = 0;
+
+  void attach(image_data data) { set_data(data); }
+  /// Attach matrix data to this class.
+  void attach(image_matrix *matrix) {
+    detach();
+    image_wrapper_base::set_data(image_data(matrix));
+  }
+  /// Attach matrix data to this class.
+  void attach(image_matrix *matrix, image_channel channel) {
+    attach(matrix);
+    image_wrapper_base::set_channel(channel);
+  }
+  /// Attach linear data to this class.
+  void attach(const void *ptr, size_t count) {
+    attach(ptr, count, get_channel());
+  }
+  /// Attach linear data to this class.
+  void attach(const void *ptr, size_t count, image_channel channel) {
+    detach();
+    image_wrapper_base::set_data(image_data(const_cast<void *>(ptr), count, channel));
+  }
+  /// Attach 2D data to this class.
+  void attach(const void *data, size_t x, size_t y, size_t pitch) {
+    attach(data, x, y, pitch, get_channel());
+  }
+  /// Attach 2D data to this class.
+  void attach(const void *data, size_t x, size_t y, size_t pitch,
+              image_channel channel) {
+    detach();
+    image_wrapper_base::set_data(
+        image_data(const_cast<void *>(data), x, y, pitch, channel));
+  }
+  /// Detach data.
+  virtual void detach() {}
+
+  sampling_info get_sampling_info() { return _sampling_info; }
+  void set_sampling_info(sampling_info info) {
+    _sampling_info = info;
+  }
+  const image_data &get_data() { return _data; }
+  void set_data(image_data data) { _data = data; }
+
+  image_channel get_channel() { return _data.get_channel(); }
+  void set_channel(image_channel channel) { _data.set_channel(channel); }
+
+  image_channel_data_type get_channel_data_type() {
+    return _data.get_channel_data_type();
+  }
+  void set_channel_data_type(image_channel_data_type type) {
+    _data.set_channel_data_type(type);
+  }
+
+  unsigned get_channel_size() { return _data.get_channel_size(); }
+  void set_channel_size(unsigned channel_num, unsigned channel_size) {
+    return _data.set_channel_size(channel_num, channel_size);
+  }
+
+  sycl::addressing_mode get_addressing_mode() {
+    return _sampling_info.get_addressing_mode();
+  }
+  void set(sycl::addressing_mode addressing_mode) {
+    _sampling_info.set(addressing_mode);
+  }
+
+  sycl::filtering_mode get_filtering_mode() {
+    return _sampling_info.get_filtering_mode();
+  }
+  void set(sycl::filtering_mode filtering_mode) {
+    _sampling_info.set(filtering_mode);
+  }
+
+  sycl::coordinate_normalization_mode get_coordinate_normalization_mode() {
+    return _sampling_info.get_coordinate_normalization_mode();
+  }
+  void
+  set(sycl::coordinate_normalization_mode coordinate_normalization_mode) {
+    _sampling_info.set(coordinate_normalization_mode);
+  }
+
+  bool is_coordinate_normalized() {
+    return _sampling_info.is_coordinate_normalized();
+  }
+  void set_coordinate_normalization_mode(int is_normalized) {
+    _sampling_info.set_coordinate_normalization_mode(is_normalized);
+  }
+  void
+  set(sycl::addressing_mode addressing_mode,
+      sycl::filtering_mode filtering_mode,
+      sycl::coordinate_normalization_mode coordinate_normalization_mode) {
+    set(addressing_mode);
+    set(filtering_mode);
+    set(coordinate_normalization_mode);
+  }
+  void set(sycl::addressing_mode addressing_mode,
+           sycl::filtering_mode filtering_mode, int is_normalized) {
+    set(addressing_mode);
+    set(filtering_mode);
+    set_coordinate_normalization_mode(is_normalized);
+  }
+
+  unsigned get_channel_num() { return _data.get_channel_num(); }
+  void set_channel_num(unsigned num) {
+    return _data.set_channel_num(num);
+  }
+
+  sycl::image_channel_type get_channel_type() {
+    return _data.get_channel_type();
+  }
+  void set_channel_type(sycl::image_channel_type type) {
+    return _data.set_channel_type(type);
+  }
+
+  sycl::sampler get_sampler() {
+    sycl::sampler smp = _sampling_info.get_sampler();
+    /// linear memory only used for sycl::filtering_mode::nearest.
+    if (_data.get_data_type() == image_data_type::linear) {
+      smp = sycl::sampler(smp.get_coordinate_normalization_mode(),
+                          smp.get_addressing_mode(),
+                          sycl::filtering_mode::nearest);
+    }
+    return smp;
+  }
+};
+inline image_wrapper_base::~image_wrapper_base() {}
+using image_wrapper_base_p = image_wrapper_base *;
+
+template <class T, int dimensions, bool IsImageArray> class image_accessor_ext;
+
+/// Image class, wrapper of sycl::image.
+template <class T, int dimensions, bool IsImageArray = false> class image_wrapper : public image_wrapper_base {
+  sycl::image<dimensions> *_image = nullptr;
+
+#ifndef DPCT_USM_LEVEL_NONE
+  std::vector<char> _host_buffer;
+#endif
+
+  void create_image(sycl::queue q) {
+    auto &data = get_data();
+    if (data.get_data_type() == image_data_type::matrix) {
+      _image = static_cast<image_matrix_p>(data.get_data_ptr())
+          ->create_image<dimensions>(data.get_channel());
+      return;
+    }
+    auto ptr = data.get_data_ptr();
+    auto channel = data.get_channel();
+
+    if (detail::get_pointer_attribute(q, ptr) == detail::pointer_access_attribute::device_only) {
+#ifdef DPCT_USM_LEVEL_NONE
+      ptr = get_buffer(ptr)
+                .template get_access<sycl::access_mode::read_write>()
+                .get_pointer();
+#else
+      auto sz = data.get_x();
+      if (data.get_data_type() == image_data_type::pitch)
+        sz *= channel.get_total_size() * data.get_y();
+      _host_buffer.resize(sz);
+      q.memcpy(_host_buffer.data(), ptr, sz).wait();
+      ptr = _host_buffer.data();
+#endif
+    }
+
+    if constexpr (dimensions == 1) {
+      assert(data.get_data_type() == image_data_type::linear);
+      _image = new sycl::image<1>(
+        ptr, channel.get_channel_order(), channel.get_channel_type(),
+        sycl::range<1>(data.get_x() / channel.get_total_size()));
+    } else if constexpr (dimensions == 2) {
+      assert(data.get_data_type() == image_data_type::pitch);
+      _image = new sycl::image<2>(ptr, channel.get_channel_order(),
+                                  channel.get_channel_type(),
+                                  sycl::range<2>(data.get_x(), data.get_y()),
+                                  sycl::range<1>(data.get_pitch()));
+    } else {
+      throw std::runtime_error("3D image only support matrix data");
+    }
+    return;
+  }
+
+public:
+  using acc_data_t = typename detail::image_trait<T>::acc_data_t;
+  using accessor_t =
+      typename image_accessor_ext<T, IsImageArray ? (dimensions - 1) : dimensions,
+                              IsImageArray>::accessor_t;
+
+  image_wrapper() { set_channel(image_channel::create<T>()); }
+  ~image_wrapper() { detach(); }
+
+  /// Get image accessor.
+  accessor_t get_access(sycl::handler &cgh, sycl::queue &q = get_default_queue()) {
+    if (!_image)
+      create_image(q);
+    return accessor_t(*_image, cgh);
+  }
+
+  /// Detach data.
+  void detach() override {
+    if (_image)
+      delete _image;
+    _image = nullptr;
+  }
+};
+
+/// Wrap sampler and image accessor together.
+template <class T, int dimensions, bool IsImageArray = false>
+class image_accessor_ext {
+public:
+  using accessor_t =
+      typename detail::image_trait<T>::template accessor_t<dimensions>;
+  using data_t = typename detail::image_trait<T>::data_t;
+  sycl::sampler _sampler;
+  accessor_t _img_acc;
+
+public:
+  image_accessor_ext(sycl::sampler sampler, accessor_t acc)
+      : _sampler(sampler), _img_acc(acc) {}
+
+  /// Read data from accessor.
+  template <bool Available = dimensions == 3>
+  typename std::enable_if<Available, data_t>::type read(float x, float y,
+                                                        float z) {
+    return detail::fetch_data<T>()(
+        _img_acc.read(sycl::float4(x, y, z, 0), _sampler));
+  }
+  /// Read data from accessor.
+  template <class Coord0, class Coord1, class Coord2,
+            bool Available = dimensions == 3 &&
+                             std::is_integral<Coord0>::value
+                                 &&std::is_integral<Coord1>::value
+                                     &&std::is_integral<Coord2>::value>
+  typename std::enable_if<Available, data_t>::type read(Coord0 x, Coord1 y,
+                                                        Coord2 z) {
+    return detail::fetch_data<T>()(
+        _img_acc.read(sycl::int4(x, y, z, 0), _sampler));
+  }
+  /// Read data from accessor.
+  template <bool Available = dimensions == 2>
+  typename std::enable_if<Available, data_t>::type read(float x, float y) {
+    return detail::fetch_data<T>()(
+        _img_acc.read(sycl::float2(x, y), _sampler));
+  }
+  /// Read data from accessor.
+  template <class Coord0, class Coord1,
+            bool Available = dimensions == 2 &&
+                             std::is_integral<Coord0>::value
+                                 &&std::is_integral<Coord1>::value>
+  typename std::enable_if<Available, data_t>::type read(Coord0 x, Coord1 y) {
+    return detail::fetch_data<T>()(
+        _img_acc.read(sycl::int2(x, y), _sampler));
+  }
+  /// Read data from accessor.
+  template <bool Available = dimensions == 1>
+  typename std::enable_if<Available, data_t>::type read(float x) {
+    return detail::fetch_data<T>()(_img_acc.read(x, _sampler));
+  }
+  /// Read data from accessor.
+  template <class CoordT,
+            bool Available = dimensions == 1 && std::is_integral<CoordT>::value>
+  typename std::enable_if<Available, data_t>::type read(CoordT x) {
+    return detail::fetch_data<T>()(_img_acc.read(x, _sampler));
+  }
+};
+
+template <class T, int dimensions> class image_accessor_ext<T, dimensions, true> {
+public:
+  using accessor_t =
+      typename detail::image_trait<T>::template array_accessor_t<dimensions>;
+  using data_t = typename detail::image_trait<T>::data_t;
+  sycl::sampler _sampler;
+  accessor_t _img_acc;
+
+public:
+  image_accessor_ext(sycl::sampler sampler, accessor_t acc)
+      : _sampler(sampler), _img_acc(acc) {}
+
+  /// Read data from accessor.
+  template <bool Available = dimensions == 2>
+  typename std::enable_if<Available, data_t>::type read(int index, float x,
+                                                        float y) {
+    return detail::fetch_data<T>()(
+        _img_acc[index].read(sycl::float2(x, y), _sampler));
+  }
+  /// Read data from accessor.
+  template <bool Available = dimensions == 2>
+  typename std::enable_if<Available, data_t>::type read(int index, int x, int y) {
+    return detail::fetch_data<T>()(
+        _img_acc[index].read(sycl::int2(x, y), _sampler));
+  }
+  /// Read data from accessor.
+  template <bool Available = dimensions == 1>
+  typename std::enable_if<Available, data_t>::type read(int index, float x) {
+    return detail::fetch_data<T>()(
+        _img_acc[index].read(x, _sampler));
+  }
+  /// Read data from accessor.
+  template <bool Available = dimensions == 1>
+  typename std::enable_if<Available, data_t>::type read(int index, int x) {
+    return detail::fetch_data<T>()(
+        _img_acc[index].read(x, _sampler));
+  }
+};
+
+/// Create image wrapper according to image data and sampling info.
+/// \return Pointer to image wrapper base class.
+/// \param data Image data used to create image wrapper.
+/// \param info Image sampling info used to create image wrapper.
+/// \returns Pointer to base class of created image wrapper object.
+static inline image_wrapper_base *create_image_wrapper(image_data data,
+                              sampling_info info) {
+  image_channel channel;
+  int dims = 1;
+  if (data.get_data_type() == image_data_type::matrix) {
+    auto matrix = (image_matrix_p)data.get_data_ptr();
+    channel = matrix->get_channel();
+    dims = matrix->get_dims();
+  } else {
+    if (data.get_data_type() == image_data_type::pitch) {
+      dims = 2;
+    }
+    channel = data.get_channel();
+  }
+
+  if (auto ret = detail::create_image_wrapper(channel, dims)) {
+    ret->set_sampling_info(info);
+    ret->set_data(data);
+    return ret;
+  }
+  return nullptr;
+}
+
+namespace detail {
+/// Create image according with given type \p T and \p dims.
+template <class T> static image_wrapper_base *create_image_wrapper(int dims) {
+  switch (dims) {
+  case 1:
+    return new image_wrapper<T, 1>();
+  case 2:
+    return new image_wrapper<T, 2>();
+  case 3:
+    return new image_wrapper<T, 3>();
+  default:
+    return nullptr;
+  }
+}
+/// Create image with given data type \p T, channel order and dims
+template <class T>
+static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims) {
+  switch (channel_num) {
+  case 1:
+    return create_image_wrapper<T>(dims);
+  case 2:
+    return create_image_wrapper<sycl::vec<T, 2>>(dims);
+  case 3:
+    return create_image_wrapper<sycl::vec<T, 3>>(dims);
+  case 4:
+    return create_image_wrapper<sycl::vec<T, 4>>(dims);
+  default:
+    return nullptr;
+  }
+}
+
+/// Create image with channel info and specified dimensions.
+static image_wrapper_base *create_image_wrapper(image_channel channel, int dims) {
+  switch (channel.get_channel_type()) {
+  case sycl::image_channel_type::fp16:
+    return create_image_wrapper<sycl::half>(channel.get_channel_num(), dims);
+  case sycl::image_channel_type::fp32:
+    return create_image_wrapper<float>(channel.get_channel_num(), dims);
+  case sycl::image_channel_type::signed_int8:
+    return create_image_wrapper<std::int8_t>(channel.get_channel_num(), dims);
+  case sycl::image_channel_type::signed_int16:
+    return create_image_wrapper<std::int16_t>(channel.get_channel_num(), dims);
+  case sycl::image_channel_type::signed_int32:
+    return create_image_wrapper<std::int32_t>(channel.get_channel_num(), dims);
+  case sycl::image_channel_type::unsigned_int8:
+    return create_image_wrapper<std::uint8_t>(channel.get_channel_num(), dims);
+  case sycl::image_channel_type::unsigned_int16:
+    return create_image_wrapper<std::uint16_t>(channel.get_channel_num(), dims);
+  case sycl::image_channel_type::unsigned_int32:
+    return create_image_wrapper<std::uint32_t>(channel.get_channel_num(), dims);
+  default:
+    return nullptr;
+  }
+}
+} // namespace detail
+
+} // namespace dpct
+
+#endif // !__DPCT_IMAGE_HPP__
diff --git a/dpct/kernel.hpp b/dpct/kernel.hpp
new file mode 100644
index 0000000000000..11d1321bb4086
--- /dev/null
+++ b/dpct/kernel.hpp
@@ -0,0 +1,459 @@
+//==---- kernel.hpp -------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_KERNEL_HPP__
+#define __DPCT_KERNEL_HPP__
+
+#include <sycl/sycl.hpp>
+#ifdef _WIN32
+#include <unordered_set>
+#include <windows.h>
+#else
+#include <dlfcn.h>
+#endif
+
+#if defined(__has_include) && __has_include(<filesystem>)
+#include <filesystem>
+#elif defined(__has_include) && __has_include(<experimental/filesystem>)
+#include <experimental/filesystem>
+#else
+#error "SYCLomatic runtime requires C++ filesystem support"
+#endif
+
+#include <fstream>
+#include <image.hpp>
+#include <random>
+
+namespace dpct {
+
+typedef void (*kernel_functor)(sycl::queue &, const sycl::nd_range<3> &,
+                               unsigned int, void **, void **);
+
+struct kernel_function_info {
+  int max_work_group_size = 0;
+};
+
+static inline void get_kernel_function_info(kernel_function_info *kernel_info,
+                                            const void *function) {
+  kernel_info->max_work_group_size =
+      dpct::dev_mgr::instance()
+          .current_device()
+          .get_info<sycl::info::device::max_work_group_size>();
+}
+static inline kernel_function_info
+get_kernel_function_info(const void *function) {
+  kernel_function_info kernel_info;
+  kernel_info.max_work_group_size =
+      dpct::dev_mgr::instance()
+          .current_device()
+          .get_info<sycl::info::device::max_work_group_size>();
+  return kernel_info;
+}
+
+
+namespace detail {
+
+#if defined(__has_include) && __has_include(<filesystem>)
+namespace fs = std::filesystem;
+#else
+namespace fs = std::experimental::filesystem;
+#endif
+
+/// Write data to temporary file and return absolute path to temporary file.
+/// Temporary file is created in a temporary directory both of which have random
+/// names with only the user having access permissions.  Only one temporary file
+/// will be created in the temporary directory.
+static inline fs::path write_data_to_file(char const *const data, size_t size) {
+  std::error_code ec;
+
+  if (sizeof(size_t) >= sizeof(std::streamsize) &&
+      size > (std::numeric_limits<std::streamsize>::max)())
+    throw std::runtime_error("data file too large");
+
+  // random number generator
+  std::random_device dev;
+  std::mt19937 prng(dev());
+  std::uniform_int_distribution<uint64_t> rand(0);
+
+  // find temporary directory
+  auto tmp_dir = fs::temp_directory_path(ec);
+  if (ec)
+    throw std::runtime_error("could not find temporary directory");
+
+  // create private directory
+  std::stringstream directory;
+  fs::path directory_path;
+  constexpr int max_attempts = 5;
+  int i;
+
+  for (i = 0; i < max_attempts; i++) {
+    directory << std::hex << rand(prng);
+    directory_path = tmp_dir / directory.str();
+    if (fs::create_directory(directory_path)) {
+      break;
+    }
+  }
+  if (i == max_attempts)
+    throw std::runtime_error("could not create directory");
+
+  // only allow owner permissions to private directory
+  fs::permissions(directory_path, fs::perms::owner_all, ec);
+  if (ec)
+    throw std::runtime_error("could not set directory permissions");
+
+  // random filename in private directory
+  std::stringstream filename;
+  filename << std::hex << rand(prng);
+#ifdef _WIN32
+  auto filepath = directory_path / (filename.str() + ".dll");
+#else
+  auto filepath = directory_path / filename.str();
+#endif
+
+  // write data to temporary file
+  auto outfile = std::ofstream(filepath, std::ios::out | std::ios::binary);
+  if (outfile) {
+    // only allow program to write file
+    fs::permissions(filepath, fs::perms::owner_write, ec);
+    if (ec)
+      throw std::runtime_error("could not set permissions");
+
+    outfile.write(data, size);
+    if (!outfile.good())
+      throw std::runtime_error("could not write data");
+    outfile.close();
+
+    // only allow program to read/execute file
+    fs::permissions(filepath, fs::perms::owner_read | fs::perms::owner_exec,
+                    ec);
+    if (ec)
+      throw std::runtime_error("could not set permissions");
+  } else
+    throw std::runtime_error("could not write data");
+
+  // check temporary file contents
+  auto infile = std::ifstream(filepath, std::ios::in | std::ios::binary);
+  if (infile) {
+    bool mismatch = false;
+    size_t cnt = 0;
+
+    while (1) {
+      char c;
+      infile.get(c);
+      if (infile.eof())
+        break;
+      if (c != data[cnt++])
+        mismatch = true;
+    }
+    if (cnt != size || mismatch)
+      throw std::runtime_error("file contents not written correctly");
+  } else
+    throw std::runtime_error("could not validate file");
+
+  if (!filepath.is_absolute())
+    throw std::runtime_error("temporary filepath is not absolute");
+
+  return filepath;
+}
+
+static inline uint16_t extract16(unsigned char const *const ptr) {
+  uint16_t ret = 0;
+
+  ret |= static_cast<uint16_t>(ptr[0]) << 0;
+  ret |= static_cast<uint16_t>(ptr[1]) << 8;
+
+  return (ret);
+}
+
+static inline uint32_t extract32(unsigned char const *const ptr) {
+  uint32_t ret = 0;
+
+  ret |= static_cast<uint32_t>(ptr[0]) << 0;
+  ret |= static_cast<uint32_t>(ptr[1]) << 8;
+  ret |= static_cast<uint32_t>(ptr[2]) << 16;
+  ret |= static_cast<uint32_t>(ptr[3]) << 24;
+
+  return (ret);
+}
+
+static inline uint64_t extract64(unsigned char const *const ptr) {
+  uint64_t ret = 0;
+
+  ret |= static_cast<uint64_t>(ptr[0]) << 0;
+  ret |= static_cast<uint64_t>(ptr[1]) << 8;
+  ret |= static_cast<uint64_t>(ptr[2]) << 16;
+  ret |= static_cast<uint64_t>(ptr[3]) << 24;
+  ret |= static_cast<uint64_t>(ptr[4]) << 32;
+  ret |= static_cast<uint64_t>(ptr[5]) << 40;
+  ret |= static_cast<uint64_t>(ptr[6]) << 48;
+  ret |= static_cast<uint64_t>(ptr[7]) << 56;
+
+  return (ret);
+}
+
+static inline uint64_t get_lib_size(char const *const blob) {
+#ifdef _WIN32
+  ///////////////////////////////////////////////////////////////////////
+  // Analyze DOS stub
+  unsigned char const *const ublob =
+      reinterpret_cast<unsigned char const *const>(blob);
+  if (ublob[0] != 0x4d || ublob[1] != 0x5a) {
+    throw std::runtime_error("Blob is not a Windows DLL.");
+  }
+  uint32_t pe_header_offset = extract32(ublob + 0x3c);
+
+  ///////////////////////////////////////////////////////////////////////
+  // Ananlyze PE-header
+  unsigned char const *const pe_header = ublob + pe_header_offset;
+
+  // signature
+  uint32_t pe_signature = extract32(pe_header + 0);
+  if (pe_signature != 0x00004550) {
+    throw std::runtime_error("PE-header signature is not 0x00004550");
+  }
+
+  // machine
+  uint16_t machine = extract16(pe_header + 4);
+  if (machine != 0x8664) {
+    throw std::runtime_error("Only DLLs for x64 supported");
+  }
+
+  // number of sections
+  uint16_t number_of_sections = extract16(pe_header + 6);
+
+  // sizeof optional header
+  uint16_t sizeof_optional_header = extract16(pe_header + 20);
+
+  // magic
+  uint16_t magic = extract16(pe_header + 24);
+  if (magic != 0x10b && magic != 0x20b) {
+    throw std::runtime_error("MAGIC is not 0x010b or 0x020b");
+  }
+
+  ///////////////////////////////////////////////////////////////////////
+  // Analyze tail of optional header
+  constexpr int coff_header_size = 24;
+
+  unsigned char const *const tail_of_optional_header =
+      pe_header + coff_header_size + sizeof_optional_header;
+  if (extract64(tail_of_optional_header - 8) != 0) {
+    throw std::runtime_error("Optional header not zero-padded");
+  }
+
+  ///////////////////////////////////////////////////////////////////////
+  // Analyze last section header
+  constexpr int section_header_size = 40;
+  unsigned char const *const last_section_header =
+      tail_of_optional_header + section_header_size * (number_of_sections - 1);
+
+  uint32_t sizeof_raw_data = extract32(last_section_header + 16);
+  uint32_t pointer_to_raw_data = extract32(last_section_header + 20);
+
+  return sizeof_raw_data + pointer_to_raw_data;
+#else
+  if (blob[0] != 0x7F || blob[1] != 'E' || blob[2] != 'L' || blob[3] != 'F')
+    throw std::runtime_error("Blob is not in ELF format");
+
+  if (blob[4] != 0x02)
+    throw std::runtime_error("Only 64-bit headers are supported");
+
+  if (blob[5] != 0x01)
+    throw std::runtime_error("Only little-endian headers are supported");
+
+  unsigned char const *const ublob =
+      reinterpret_cast<unsigned char const *const>(blob);
+  uint64_t e_shoff = extract64(ublob + 0x28);
+  uint16_t e_shentsize = extract16(ublob + 0x3A);
+  uint16_t e_shnum = extract16(ublob + 0x3C);
+
+  return e_shoff + (e_shentsize * e_shnum);
+#endif
+}
+
+#ifdef _WIN32
+class path_lib_record {
+public:
+  void operator=(const path_lib_record &) = delete;
+  ~path_lib_record() {
+    for (auto entry : lib_to_path) {
+      FreeLibrary(static_cast<HMODULE>(entry.first));
+      fs::permissions(entry.second, fs::perms::owner_all);
+      fs::remove_all(entry.second.remove_filename());
+    }
+  }
+  static void record_lib_path(fs::path path, void *library) {
+    lib_to_path[library] = path;
+  }
+  static void remove_lib(void *library) {
+    auto path = lib_to_path[library];
+    std::error_code ec;
+
+    FreeLibrary(static_cast<HMODULE>(library));
+    fs::permissions(path, fs::perms::owner_all);
+    if (fs::remove_all(path.remove_filename(), ec) != 2 || ec)
+      // one directory and one temporary file should have been deleted
+      throw std::runtime_error("Directory delete failed");
+
+    lib_to_path.erase(library);
+  }
+
+private:
+  static inline std::unordered_map<void *, fs::path> lib_to_path;
+};
+#endif
+
+} // namespace detail
+
+class kernel_library {
+public:
+  kernel_library() : ptr{nullptr} {}
+  kernel_library(void *ptr) : ptr{ptr} {}
+
+  operator void *() const { return ptr; }
+
+private:
+  void *ptr;
+#ifdef _WIN32
+  static inline detail::path_lib_record single_instance_to_trigger_destructor;
+#endif
+};
+
+namespace detail {
+
+static inline kernel_library load_dl_from_data(char const *const data,
+                                               size_t size) {
+  fs::path filename = write_data_to_file(data, size);
+#ifdef _WIN32
+  void *so = LoadLibraryW(filename.wstring().c_str());
+#else
+  void *so = dlopen(filename.c_str(), RTLD_LAZY);
+#endif
+  if (so == nullptr)
+    throw std::runtime_error("Failed to load kernel library");
+
+#ifdef _WIN32
+  detail::path_lib_record::record_lib_path(filename, so);
+#else
+  std::error_code ec;
+
+  // Windows DLL cannot be deleted while in use
+  if (fs::remove_all(filename.remove_filename(), ec) != 2 || ec)
+    // one directory and one temporary file should have been deleted
+    throw std::runtime_error("Directory delete failed");
+#endif
+
+  return so;
+}
+
+} // namespace detail
+
+/// Load kernel library and return a handle to use the library.
+/// \param [in] name The name of the library.
+static inline kernel_library load_kernel_library(const std::string &name) {
+  std::ifstream ifs;
+  ifs.open(name, std::ios::in | std::ios::binary);
+
+  std::stringstream buffer;
+  buffer << ifs.rdbuf();
+
+  const std::string buffer_string = buffer.str();
+  return detail::load_dl_from_data(buffer_string.c_str(), buffer_string.size());
+}
+
+/// Load kernel library whose image is alreay in memory and return a handle to
+/// use the library.
+/// \param [in] image A pointer to the image in memory.
+static inline kernel_library load_kernel_library_mem(char const *const image) {
+  const size_t size = detail::get_lib_size(image);
+
+  return detail::load_dl_from_data(image, size);
+}
+
+/// Unload kernel library.
+/// \param [in,out] library Handle to the library to be closed.
+static inline void unload_kernel_library(const kernel_library &library) {
+#ifdef _WIN32
+  detail::path_lib_record::remove_lib(library);
+#else
+  dlclose(library);
+#endif
+}
+
+class kernel_function {
+public:
+  kernel_function() : ptr{nullptr} {}
+  kernel_function(dpct::kernel_functor ptr) : ptr{ptr} {}
+
+  operator void *() const { return ((void *)ptr); }
+
+  void operator()(sycl::queue &q, const sycl::nd_range<3> &range,
+                  unsigned int a, void **args, void **extra) {
+    ptr(q, range, a, args, extra);
+  }
+
+private:
+  dpct::kernel_functor ptr;
+};
+
+/// Find kernel function in a kernel library and return its address.
+/// \param [in] library Handle to the kernel library.
+/// \param [in] name Name of the kernel function.
+static inline dpct::kernel_function
+get_kernel_function(kernel_library &library, const std::string &name) {
+#ifdef _WIN32
+  dpct::kernel_functor fn = reinterpret_cast<dpct::kernel_functor>(
+      GetProcAddress(static_cast<HMODULE>(static_cast<void *>(library)),
+                     (name + std::string("_wrapper")).c_str()));
+#else
+  dpct::kernel_functor fn = reinterpret_cast<dpct::kernel_functor>(
+      dlsym(library, (name + std::string("_wrapper")).c_str()));
+#endif
+  if (fn == nullptr)
+    throw std::runtime_error("Failed to get function");
+  return fn;
+}
+
+/// Invoke a kernel function.
+/// \param [in] function kernel function.
+/// \param [in] queue SYCL queue used to execute kernel
+/// \param [in] groupRange SYCL group range
+/// \param [in] localRange SYCL local range
+/// \param [in] localMemSize The size of local memory required by the kernel
+///             function.
+/// \param [in] kernelParams Array of pointers to kernel arguments.
+/// \param [in] extra Extra arguments.
+static inline void invoke_kernel_function(dpct::kernel_function &function,
+                                          sycl::queue &queue,
+                                          sycl::range<3> groupRange,
+                                          sycl::range<3> localRange,
+                                          unsigned int localMemSize,
+                                          void **kernelParams, void **extra) {
+  function(queue, sycl::nd_range<3>(groupRange * localRange, localRange),
+           localMemSize, kernelParams, extra);
+}
+
+/// Find image wrapper in a kernel library and return its address.
+/// \param [in] library Handle to the kernel library.
+/// \param [in] name Name of the target image wrapper.
+static inline dpct::image_wrapper_base_p
+get_image_wrapper(dpct::kernel_library &library, const std::string &name) {
+#ifdef _WIN32
+  dpct::image_wrapper_base_p fn =
+      reinterpret_cast<dpct::image_wrapper_base_p>(GetProcAddress(
+          static_cast<HMODULE>(static_cast<void *>(library)), name.c_str()));
+#else
+  dpct::image_wrapper_base_p fn = reinterpret_cast<dpct::image_wrapper_base_p>(
+      dlsym(library, name.c_str()));
+#endif
+  if (fn == nullptr)
+    throw std::runtime_error("Failed to get image");
+  return fn;
+}
+
+} // namespace dpct
+#endif // __DPCT_KERNEL_HPP__
diff --git a/dpct/lapack_utils.hpp b/dpct/lapack_utils.hpp
new file mode 100644
index 0000000000000..dac77d5773ec4
--- /dev/null
+++ b/dpct/lapack_utils.hpp
@@ -0,0 +1,1953 @@
+//==---- lapack_utils.hpp -------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_LAPACK_UTILS_HPP__
+#define __DPCT_LAPACK_UTILS_HPP__
+
+#include "memory.hpp"
+#include "util.hpp"
+#include "lib_common_utils.hpp"
+
+#include <oneapi/mkl.hpp>
+#include <sycl/sycl.hpp>
+
+namespace dpct {
+namespace lapack {
+/// Computes all eigenvalues and, optionally, eigenvectors of a real generalized
+/// symmetric definite eigenproblem using a divide and conquer method.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] queue Device queue where calculations will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] itype Must be 1 or 2 or 3. Specifies the problem type to be solved.
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrices A and B.
+/// \param [in,out] a The symmetric matrix A.
+/// \param [in] lda The leading dimension of matrix A.
+/// \param [in,out] b The symmetric matrix B.
+/// \param [in] ldb The leading dimension of matrix B.
+/// \param [out] w Eigenvalues.
+/// \param [in] scratchpad Scratchpad memory to be used by the routine
+/// for storing intermediate results.
+/// \param [in] scratchpad_size Size of scratchpad memory as a number of
+/// floating point elements of type T.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+template <typename T>
+inline int sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
+                 oneapi::mkl::uplo uplo, int n, T *a, int lda, T *b, int ldb,
+                 T *w, T *scratchpad, int scratchpad_size, int *info) {
+#ifdef DPCT_USM_LEVEL_NONE
+  auto info_buf = get_buffer<int>(info);
+  auto a_buffer = get_buffer<T>(a);
+  auto b_buffer = get_buffer<T>(b);
+  auto w_buffer = get_buffer<T>(w);
+  auto scratchpad_buffer = get_buffer<T>(scratchpad);
+  int info_val = 0;
+  int ret_val = 0;
+  try {
+    oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a_buffer, lda,
+                               b_buffer, ldb, w_buffer, scratchpad_buffer,
+                               scratchpad_size);
+  } catch (oneapi::mkl::lapack::exception const& e) {
+    std::cerr << "Unexpected exception caught during call to LAPACK API: sygvd"
+              << std::endl
+              << "reason: " << e.what() << std::endl
+              << "info: " << e.info() << std::endl;
+    info_val = static_cast<int>(e.info());
+    ret_val = 1;
+  } catch (sycl::exception const& e) {
+    std::cerr << "Caught synchronous SYCL exception:" << std::endl
+              << "reason: " << e.what() << std::endl;
+    ret_val = 1;
+  }
+  queue.submit([&, info_val](sycl::handler &cgh) {
+    auto info_acc = info_buf.get_access<sycl::access_mode::write>(cgh);
+    cgh.single_task<dpct_kernel_name<class sygvd_set_info, T>>(
+        [=]() { info_acc[0] = info_val; });
+  });
+  return ret_val;
+#else
+  try {
+    oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w,
+                               scratchpad, scratchpad_size);
+  } catch (oneapi::mkl::lapack::exception const& e) {
+    std::cerr << "Unexpected exception caught during call to LAPACK API: sygvd"
+              << std::endl
+              << "reason: " << e.what() << std::endl
+              << "info: " << e.info() << std::endl;
+    int info_val = static_cast<int>(e.info());
+    queue.memcpy(info, &info_val, sizeof(int)).wait();
+    return 1;
+  } catch (sycl::exception const& e) {
+    std::cerr << "Caught synchronous SYCL exception:" << std::endl
+              << "reason: " << e.what() << std::endl;
+    queue.memset(info, 0, sizeof(int)).wait();
+    return 1;
+  }
+  queue.memset(info, 0, sizeof(int));
+  return 0;
+#endif
+}
+/// Computes all the eigenvalues, and optionally, the eigenvectors of a complex
+/// generalized Hermitian positive-definite eigenproblem using a divide and
+/// conquer method.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] queue Device queue where calculations will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] itype Must be 1 or 2 or 3. Specifies the problem type to be solved.
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrices A and B.
+/// \param [in,out] a The Hermitian matrix A.
+/// \param [in] lda The leading dimension of matrix A.
+/// \param [in,out] b The Hermitian matrix B.
+/// \param [in] ldb The leading dimension of matrix B.
+/// \param [in] w Eigenvalues.
+/// \param [in] scratchpad Scratchpad memory to be used by the routine
+/// for storing intermediate results.
+/// \param [in] scratchpad_size Size of scratchpad memory as a number of
+/// floating point elements of type T.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+template <typename T, typename Tw>
+inline int hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
+                 oneapi::mkl::uplo uplo, int n, T *a, int lda, T *b, int ldb,
+                 Tw *w, T *scratchpad, int scratchpad_size, int *info) {
+  using Ty = typename DataType<T>::T2;
+#ifdef DPCT_USM_LEVEL_NONE
+  auto info_buf = get_buffer<int>(info);
+  auto a_buffer = get_buffer<Ty>(a);
+  auto b_buffer = get_buffer<Ty>(b);
+  auto w_buffer = get_buffer<Tw>(w);
+  auto scratchpad_buffer = get_buffer<Ty>(scratchpad);
+  int info_val = 0;
+  int ret_val = 0;
+  try {
+    oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, a_buffer, lda,
+                               b_buffer, ldb, w_buffer, scratchpad_buffer,
+                               scratchpad_size);
+  } catch (oneapi::mkl::lapack::exception const& e) {
+    std::cerr << "Unexpected exception caught during call to LAPACK API: hegvd"
+              << std::endl
+              << "reason: " << e.what() << std::endl
+              << "info: " << e.info() << std::endl;
+    info_val = static_cast<int>(e.info());
+    ret_val = 1;
+  } catch (sycl::exception const& e) {
+    std::cerr << "Caught synchronous SYCL exception:" << std::endl
+              << "reason: " << e.what() << std::endl;
+    ret_val = 1;
+  }
+  queue.submit([&, info_val](sycl::handler &cgh) {
+    auto info_acc = info_buf.get_access<sycl::access_mode::write>(cgh);
+    cgh.single_task<dpct_kernel_name<class hegvd_set_info, T>>(
+        [=]() { info_acc[0] = info_val; });
+  });
+  return ret_val;
+#else
+  try {
+    oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, (Ty *)a, lda, (Ty *)b,
+                               ldb, w, (Ty *)scratchpad, scratchpad_size);
+  } catch (oneapi::mkl::lapack::exception const& e) {
+    std::cerr << "Unexpected exception caught during call to LAPACK API: hegvd"
+              << std::endl
+              << "reason: " << e.what() << std::endl
+              << "info: " << e.info() << std::endl;
+    int info_val = static_cast<int>(e.info());
+    queue.memcpy(info, &info_val, sizeof(int)).wait();
+    return 1;
+  } catch (sycl::exception const& e) {
+    std::cerr << "Caught synchronous SYCL exception:" << std::endl
+              << "reason: " << e.what() << std::endl;
+    queue.memset(info, 0, sizeof(int)).wait();
+    return 1;
+  }
+  queue.memset(info, 0, sizeof(int));
+  return 0;
+#endif
+}
+/// Computes the Cholesky factorizations of a batch of symmetric (or Hermitian,
+/// for complex data) positive-definite matrices.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] queue Device queue where calculations will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in,out] a Array of pointers to matrix A.
+/// \param [in] lda The leading dimension of matrix A.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+/// \param [in] group_size The batch size.
+template <typename T>
+inline int potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, int n,
+                       T *a[], int lda, int *info, int group_size) {
+#ifdef DPCT_USM_LEVEL_NONE
+  throw std::runtime_error("this API is unsupported when USM level is none");
+#else
+  using Ty = typename DataType<T>::T2;
+  struct matrix_info_t {
+    oneapi::mkl::uplo uplo_info;
+    std::int64_t n_info;
+    std::int64_t lda_info;
+    std::int64_t group_size_info;
+  };
+  matrix_info_t *matrix_info =
+      (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
+  matrix_info->uplo_info = uplo;
+  matrix_info->n_info = n;
+  matrix_info->lda_info = lda;
+  matrix_info->group_size_info = group_size;
+  std::int64_t scratchpad_size = 0;
+  sycl::event e;
+  Ty *scratchpad = nullptr;
+  try {
+    scratchpad_size = oneapi::mkl::lapack::potrf_batch_scratchpad_size<Ty>(
+        queue, &(matrix_info->uplo_info), &(matrix_info->n_info),
+        &(matrix_info->lda_info), 1, &(matrix_info->group_size_info));
+    scratchpad = sycl::malloc_device<Ty>(scratchpad_size, queue);
+    e = oneapi::mkl::lapack::potrf_batch(
+        queue, &(matrix_info->uplo_info), &(matrix_info->n_info), (Ty **)a,
+        &(matrix_info->lda_info), 1, &(matrix_info->group_size_info),
+        scratchpad, scratchpad_size);
+  } catch (oneapi::mkl::lapack::batch_error const &be) {
+    std::cerr << "Unexpected exception caught during call to LAPACK API: "
+                 "potrf_batch_scratchpad_size/potrf_batch"
+              << std::endl
+              << "reason: " << be.what() << std::endl
+              << "number: " << be.info() << std::endl;
+    int i = 0;
+    auto &ids = be.ids();
+    std::vector<int> info_vec(group_size);
+    for (auto const &e : be.exceptions()) {
+      try {
+        std::rethrow_exception(e);
+      } catch (oneapi::mkl::lapack::exception &e) {
+        std::cerr << "Exception " << ids[i] << std::endl
+                  << "reason: " << e.what() << std::endl
+                  << "info: " << e.info() << std::endl;
+        info_vec[i] = e.info();
+        i++;
+      }
+    }
+    queue.memcpy(info, info_vec.data(), group_size * sizeof(int)).wait();
+    std::free(matrix_info);
+    if (scratchpad)
+      sycl::free(scratchpad, queue);
+    return 1;
+  } catch (sycl::exception const &e) {
+    std::cerr << "Caught synchronous SYCL exception:" << std::endl
+              << "reason: " << e.what() << std::endl;
+    queue.memset(info, 0, group_size * sizeof(int)).wait();
+    std::free(matrix_info);
+    if (scratchpad)
+      sycl::free(scratchpad, queue);
+    return 1;
+  }
+  queue.submit([&](sycl::handler &cgh) {
+    cgh.depends_on(e);
+    cgh.host_task([=] {
+      std::free(matrix_info);
+      sycl::free(scratchpad, queue);
+    });
+  });
+  queue.memset(info, 0, group_size * sizeof(int));
+  return 0;
+#endif
+}
+/// Solves a batch of systems of linear equations with a Cholesky-factored
+/// symmetric (Hermitian) positive-definite coefficient matrices.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] queue Device queue where calculations will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in] nrhs The number of right-hand sides.
+/// \param [in,out] a Array of pointers to matrix A.
+/// \param [in] lda The leading dimension of matrix A.
+/// \param [in,out] b Array of pointers to matrix B.
+/// \param [in] ldb The leading dimension of matrix B.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+/// \param [in] group_size The batch size.
+template <typename T>
+inline int potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, int n,
+                       int nrhs, T *a[], int lda, T *b[], int ldb, int *info,
+                       int group_size) {
+#ifdef DPCT_USM_LEVEL_NONE
+  throw std::runtime_error("this API is unsupported when USM level is none");
+#else
+  using Ty = typename DataType<T>::T2;
+  struct matrix_info_t {
+    oneapi::mkl::uplo uplo_info;
+    std::int64_t n_info;
+    std::int64_t nrhs_info;
+    std::int64_t lda_info;
+    std::int64_t ldb_info;
+    std::int64_t group_size_info;
+  };
+  matrix_info_t *matrix_info =
+      (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
+  matrix_info->uplo_info = uplo;
+  matrix_info->n_info = n;
+  matrix_info->nrhs_info = nrhs;
+  matrix_info->lda_info = lda;
+  matrix_info->ldb_info = ldb;
+  matrix_info->group_size_info = group_size;
+  std::int64_t scratchpad_size = 0;
+  sycl::event e;
+  Ty *scratchpad = nullptr;
+  try {
+    scratchpad_size = oneapi::mkl::lapack::potrs_batch_scratchpad_size<Ty>(
+        queue, &(matrix_info->uplo_info), &(matrix_info->n_info),
+        &(matrix_info->nrhs_info), &(matrix_info->lda_info),
+        &(matrix_info->ldb_info), 1, &(matrix_info->group_size_info));
+    scratchpad = sycl::malloc_device<Ty>(scratchpad_size, queue);
+    e = oneapi::mkl::lapack::potrs_batch(
+        queue, &(matrix_info->uplo_info), &(matrix_info->n_info),
+        &(matrix_info->nrhs_info), (Ty **)a, &(matrix_info->lda_info), (Ty **)b,
+        &(matrix_info->ldb_info), 1, &(matrix_info->group_size_info),
+        scratchpad, scratchpad_size);
+  } catch (oneapi::mkl::lapack::batch_error const &be) {
+    std::cerr << "Unexpected exception caught during call to LAPACK API: "
+                 "potrs_batch_scratchpad_size/potrs_batch"
+              << std::endl
+              << "reason: " << be.what() << std::endl
+              << "number: " << be.info() << std::endl;
+    int i = 0;
+    auto &ids = be.ids();
+    std::vector<int> info_vec(group_size);
+    for (auto const &e : be.exceptions()) {
+      try {
+        std::rethrow_exception(e);
+      } catch (oneapi::mkl::lapack::exception &e) {
+        std::cerr << "Exception " << ids[i] << std::endl
+                  << "reason: " << e.what() << std::endl
+                  << "info: " << e.info() << std::endl;
+        info_vec[i] = e.info();
+        i++;
+      }
+    }
+    queue.memcpy(info, info_vec.data(), group_size * sizeof(int)).wait();
+    std::free(matrix_info);
+    if (scratchpad)
+      sycl::free(scratchpad, queue);
+    return 1;
+  } catch (sycl::exception const &e) {
+    std::cerr << "Caught synchronous SYCL exception:" << std::endl
+              << "reason: " << e.what() << std::endl;
+    queue.memset(info, 0, group_size * sizeof(int)).wait();
+    std::free(matrix_info);
+    if (scratchpad)
+      sycl::free(scratchpad, queue);
+    return 1;
+  }
+  queue.submit([&](sycl::handler &cgh) {
+    cgh.depends_on(e);
+    cgh.host_task([=] {
+      std::free(matrix_info);
+      sycl::free(scratchpad, queue);
+    });
+  });
+  queue.memset(info, 0, group_size * sizeof(int));
+  return 0;
+#endif
+}
+
+namespace detail {
+template <template <typename> typename functor_t, typename... args_t>
+inline int lapack_shim(sycl::queue &q, library_data_t a_type, int *info,
+                       std::string const &lapack_api_name, args_t &&...args) {
+  auto handle_lapack_exception = [&](const oneapi::mkl::lapack::exception &e) {
+    std::cerr << "Unexpected exception caught during call to LAPACK API: "
+              << lapack_api_name << std::endl
+              << "reason: " << e.what() << std::endl
+              << "info: " << e.info() << std::endl
+              << "detail: " << e.detail() << std::endl;
+    if (e.info() < std::numeric_limits<int>::min() ||
+        e.info() > std::numeric_limits<int>::max()) {
+      throw std::runtime_error("e.info() exceeds the limit of int type");
+    }
+    int info_val = static_cast<int>(e.info());
+    if (info)
+      dpct::detail::dpct_memcpy(q, info, &info_val, sizeof(int),
+                                memcpy_direction::host_to_device)
+          .wait();
+    return 1;
+  };
+  try {
+    switch (a_type) {
+    case library_data_t::real_float: {
+      functor_t<float>()(std::forward<args_t>(args)...);
+      break;
+    }
+    case library_data_t::real_double: {
+      functor_t<double>()(std::forward<args_t>(args)...);
+      break;
+    }
+    case library_data_t::complex_float: {
+      functor_t<std::complex<float>>()(std::forward<args_t>(args)...);
+      break;
+    }
+    case library_data_t::complex_double: {
+      functor_t<std::complex<double>>()(std::forward<args_t>(args)...);
+      break;
+    }
+    default:
+      throw std::runtime_error("the data type is unsupported");
+    }
+  } catch (oneapi::mkl::lapack::batch_error const &be) {
+    try {
+      std::rethrow_exception(be.exceptions()[0]);
+    } catch (oneapi::mkl::lapack::exception &e) {
+      return handle_lapack_exception(e);
+    }
+  } catch (oneapi::mkl::lapack::exception const &e) {
+    return handle_lapack_exception(e);
+  } catch (sycl::exception const &e) {
+    std::cerr << "Caught synchronous SYCL exception:" << std::endl
+              << "reason: " << e.what() << std::endl;
+    if (info)
+      dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int)).wait();
+    return 1;
+  }
+  return 0;
+}
+
+template <typename T> class working_memory {
+public:
+  working_memory(std::size_t element_number, const sycl::queue &q) : _q(q) {
+    _ptr = dpct::detail::dpct_malloc(element_number * sizeof(T), _q);
+  }
+  auto get_memory() {
+    return dpct::detail::get_memory<T>(_ptr);
+  }
+  auto get_ptr() {
+    return _ptr;
+  }
+  void set_event(sycl::event e) { _e = e; }
+  ~working_memory() {
+    if (_ptr) {
+      dpct::async_dpct_free({_ptr}, {_e}, _q);
+    }
+  }
+
+private:
+  void *_ptr = nullptr;
+  sycl::event _e;
+  sycl::queue _q;
+};
+
+std::size_t byte_to_element_number(std::size_t size_in_byte,
+                                   dpct::library_data_t element_type) {
+  auto dv = std::lldiv(
+      size_in_byte,
+      dpct::detail::library_data_size[static_cast<unsigned int>(element_type)] /
+          8);
+  if (dv.rem) {
+    throw std::runtime_error(
+        "size_in_byte is not divisible by the size of element (in bytes)");
+  }
+  return dv.quot;
+}
+std::size_t element_number_to_byte(std::size_t size_in_element,
+                                   dpct::library_data_t element_type) {
+  auto dv = std::lldiv(
+      dpct::detail::library_data_size[static_cast<unsigned int>(element_type)],
+      8);
+  if (dv.rem) {
+    throw std::runtime_error(
+        "the size of element (in bits) is not divisible by 8");
+  }
+  return size_in_element * dv.quot;
+}
+
+inline oneapi::mkl::jobsvd char2jobsvd(signed char job) {
+  switch (job) {
+  case 'A':
+    return oneapi::mkl::jobsvd::vectors;
+  case 'S':
+    return oneapi::mkl::jobsvd::somevec;
+  case 'O':
+    return oneapi::mkl::jobsvd::vectorsina;
+  case 'N':
+    return oneapi::mkl::jobsvd::novec;
+  default:
+    throw std::runtime_error("the job type is unsupported");
+  }
+}
+
+template <typename T> struct getrf_scratchpad_size_impl {
+  void operator()(sycl::queue &q, std::int64_t m, std::int64_t n,
+                  library_data_t a_type, std::int64_t lda,
+                  std::size_t &device_ws_size) {
+    device_ws_size =
+        oneapi::mkl::lapack::getrf_scratchpad_size<T>(q, m, n, lda);
+  }
+};
+
+template <typename T> struct getrf_impl {
+  void operator()(sycl::queue &q, std::int64_t m, std::int64_t n,
+                  library_data_t a_type, void *a, std::int64_t lda,
+                  std::int64_t *ipiv, void *device_ws,
+                  std::size_t device_ws_size, int *info) {
+    auto ipiv_data = dpct::detail::get_memory<std::int64_t>(ipiv);
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
+    oneapi::mkl::lapack::getrf(q, m, n, a_data, lda, ipiv_data, device_ws_data,
+                               device_ws_size);
+    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+  }
+};
+
+template <typename T> struct getrs_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::transpose trans, std::int64_t n,
+                  std::int64_t nrhs, library_data_t a_type, void *a,
+                  std::int64_t lda, std::int64_t *ipiv, library_data_t b_type,
+                  void *b, std::int64_t ldb, int *info) {
+    auto ipiv_data = dpct::detail::get_memory<std::int64_t>(ipiv);
+    std::int64_t device_ws_size = oneapi::mkl::lapack::getrs_scratchpad_size<T>(
+        q, trans, n, nrhs, lda, ldb);
+    working_memory<T> device_ws(device_ws_size, q);
+    auto device_ws_data = device_ws.get_memory();
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto b_data = dpct::detail::get_memory<T>(b);
+    oneapi::mkl::lapack::getrs(q, trans, n, nrhs, a_data, lda, ipiv_data,
+                               b_data, ldb, device_ws_data, device_ws_size);
+    sycl::event e = dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+    device_ws.set_event(e);
+  }
+};
+
+template <typename T> struct geqrf_scratchpad_size_impl {
+  void operator()(sycl::queue &q, std::int64_t m, std::int64_t n,
+                  library_data_t a_type, std::int64_t lda,
+                  std::size_t &device_ws_size) {
+    device_ws_size =
+        oneapi::mkl::lapack::geqrf_scratchpad_size<T>(q, m, n, lda);
+  }
+};
+
+template <typename T> struct geqrf_impl {
+  void operator()(sycl::queue &q, std::int64_t m, std::int64_t n,
+                  library_data_t a_type, void *a, std::int64_t lda,
+                  library_data_t tau_type, void *tau, void *device_ws,
+                  std::size_t device_ws_size, int *info) {
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto tau_data = dpct::detail::get_memory<T>(tau);
+    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
+    oneapi::mkl::lapack::geqrf(q, m, n, a_data, lda, tau_data, device_ws_data,
+                               device_ws_size);
+    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+  }
+};
+
+template <typename T> struct getrfnp_impl {
+  void operator()(sycl::queue &q, std::int64_t m, std::int64_t n,
+                  library_data_t a_type, void *a, std::int64_t lda,
+                  std::int64_t *ipiv, void *device_ws,
+                  std::size_t device_ws_size, int *info) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error(
+        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
+        "Project does not support this API.");
+#else
+    std::int64_t a_stride = m * lda;
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
+    oneapi::mkl::lapack::getrfnp_batch(q, m, n, a_data, lda, a_stride, 1,
+                                       device_ws_data, device_ws_size);
+    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+#endif
+  }
+};
+
+template <typename T> struct gesvd_scratchpad_size_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::jobsvd jobu,
+                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
+                  library_data_t a_type, std::int64_t lda,
+                  library_data_t u_type, std::int64_t ldu,
+                  library_data_t vt_type, std::int64_t ldvt,
+                  std::size_t &device_ws_size) {
+    device_ws_size = oneapi::mkl::lapack::gesvd_scratchpad_size<T>(
+        q, jobu, jobvt, m, n, lda, ldu, ldvt);
+  }
+};
+
+template <typename T> struct ElementType {
+  using value_tpye = T;
+};
+template <typename T> struct ElementType<std::complex<T>> {
+  using value_tpye = T;
+};
+template <typename T> struct gesvd_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::jobsvd jobu,
+                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
+                  library_data_t a_type, void *a, std::int64_t lda,
+                  library_data_t s_type, void *s, library_data_t u_type,
+                  void *u, std::int64_t ldu, library_data_t vt_type, void *vt,
+                  std::int64_t ldvt, void *device_ws,
+                  std::size_t device_ws_size, int *info) {
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto s_data =
+        dpct::detail::get_memory<typename ElementType<T>::value_tpye>(s);
+    auto u_data = dpct::detail::get_memory<T>(u);
+    auto vt_data = dpct::detail::get_memory<T>(vt);
+    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
+    oneapi::mkl::lapack::gesvd(q, jobu, jobvt, m, n, a_data, lda, s_data,
+                               u_data, ldu, vt_data, ldvt, device_ws_data,
+                               device_ws_size);
+    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+  }
+};
+template <typename T> struct gesvd_conj_impl : public gesvd_impl<T> {
+  void operator()(sycl::queue &q, oneapi::mkl::jobsvd jobu,
+                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
+                  library_data_t a_type, void *a, std::int64_t lda,
+                  library_data_t s_type, void *s, library_data_t u_type,
+                  void *u, std::int64_t ldu, library_data_t vt_type, void *vt,
+                  std::int64_t ldvt, void *device_ws,
+                  std::size_t device_ws_size, int *info) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error(
+        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
+        "Project does not support this API.");
+#else
+    using base = gesvd_impl<T>;
+    base::operator()(q, jobu, jobvt, m, n, a_type, a, lda, s_type, s, u_type, u,
+                     ldu, vt_type, vt, ldvt, device_ws, device_ws_size, info);
+    auto vt_data = dpct::detail::get_memory<T>(vt);
+    oneapi::mkl::blas::row_major::imatcopy(q, oneapi::mkl::transpose::conjtrans,
+                                           n, n, T(1.0f), vt_data, ldvt, ldvt);
+    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+#endif
+  }
+};
+
+template <typename T> struct potrf_scratchpad_size_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::uplo uplo, std::int64_t n,
+                  library_data_t a_type, std::int64_t lda,
+                  std::size_t &device_ws_size) {
+    device_ws_size =
+        oneapi::mkl::lapack::potrf_scratchpad_size<T>(q, uplo, n, lda);
+  }
+};
+
+template <typename T> struct potrf_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::uplo uplo, std::int64_t n,
+                  library_data_t a_type, void *a, std::int64_t lda,
+                  void *device_ws, std::size_t device_ws_size, int *info) {
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
+    oneapi::mkl::lapack::potrf(q, uplo, n, a_data, lda, device_ws_data,
+                               device_ws_size);
+    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+  }
+};
+
+template <typename T> struct potrs_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::uplo uplo, std::int64_t n,
+                  std::int64_t nrhs, library_data_t a_type, void *a,
+                  std::int64_t lda, library_data_t b_type, void *b,
+                  std::int64_t ldb, int *info) {
+    std::int64_t device_ws_size = oneapi::mkl::lapack::potrs_scratchpad_size<T>(
+        q, uplo, n, nrhs, lda, ldb);
+    working_memory<T> device_ws(device_ws_size, q);
+    auto device_ws_data = device_ws.get_memory();
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto b_data = dpct::detail::get_memory<T>(b);
+    oneapi::mkl::lapack::potrs(q, uplo, n, nrhs, a_data, lda, b_data, ldb,
+                               device_ws_data, device_ws_size);
+    sycl::event e = dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+    device_ws.set_event(e);
+  }
+};
+
+template <typename T> struct value_type_trait {
+  using value_type = T;
+};
+template <typename T> struct value_type_trait<std::complex<T>> {
+  using value_type = T;
+};
+
+template <typename T> auto lamch_s() {
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                           "Project does not support this API.");
+#else
+  if constexpr (std::is_same_v<T, float>) {
+    return slamch("S");
+  } else if constexpr (std::is_same_v<T, double>) {
+    return dlamch("S");
+  }
+  throw std::runtime_error("the type is unsupported");
+#endif
+}
+
+#define DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(FUNC, ...)                          \
+  do {                                                                         \
+    if constexpr (std::is_floating_point_v<T>) {                               \
+      device_ws_size = oneapi::mkl::lapack::sy##FUNC(__VA_ARGS__);             \
+    } else {                                                                   \
+      device_ws_size = oneapi::mkl::lapack::he##FUNC(__VA_ARGS__);             \
+    }                                                                          \
+  } while (0)
+
+#define DISPATCH_FLOAT_FOR_CALCULATION(FUNC, ...)                              \
+  do {                                                                         \
+    if constexpr (std::is_floating_point_v<T>) {                               \
+      oneapi::mkl::lapack::sy##FUNC(__VA_ARGS__);                              \
+    } else {                                                                   \
+      oneapi::mkl::lapack::he##FUNC(__VA_ARGS__);                              \
+    }                                                                          \
+  } while (0)
+
+template <typename T> struct syheevx_scratchpad_size_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::compz jobz,
+                  oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
+                  std::int64_t n, std::int64_t lda, void *vl, void *vu,
+                  std::int64_t il, std::int64_t iu,
+                  std::size_t &device_ws_size) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error(
+        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
+        "Project does not support this API.");
+#else
+    using value_t = typename value_type_trait<T>::value_type;
+    auto vl_value = *reinterpret_cast<value_t *>(vl);
+    auto vu_value = *reinterpret_cast<value_t *>(vu);
+    auto abstol = 2 * lamch_s<value_t>();
+    DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(evx_scratchpad_size<T>, q, jobz, range,
+                                       uplo, n, lda, vl_value, vu_value, il, iu,
+                                       abstol, lda);
+#endif
+  }
+};
+
+template <typename T> constexpr library_data_t get_library_data_t_from_type() {
+  if constexpr (std::is_same_v<T, float>) {
+    return library_data_t::real_float;
+  } else if constexpr (std::is_same_v<T, double>) {
+    return library_data_t::real_double;
+  } else if constexpr (std::is_same_v<T, sycl::float2> ||
+                       std::is_same_v<T, std::complex<float>>) {
+    return library_data_t::complex_float;
+  } else if constexpr (std::is_same_v<T, sycl::double2> ||
+                       std::is_same_v<T, std::complex<double>>) {
+    return library_data_t::complex_double;
+  }
+  throw std::runtime_error("the type is unsupported");
+}
+
+template <typename T> struct syheevx_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::compz jobz,
+                  oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
+                  std::int64_t n, library_data_t a_type, void *a,
+                  std::int64_t lda, void *vl, void *vu, std::int64_t il,
+                  std::int64_t iu, std::int64_t *m, library_data_t w_type,
+                  void *w, void *device_ws, std::size_t device_ws_size,
+                  int *info) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error(
+        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
+        "Project does not support this API.");
+#else
+    using value_t = typename value_type_trait<T>::value_type;
+    working_memory<T> z(n * lda, q);
+    working_memory<std::int64_t> m_device(1, q);
+    auto z_data = z.get_memory();
+    auto m_device_data = m_device.get_memory();
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
+    auto vl_value = *reinterpret_cast<value_t *>(vl);
+    auto vu_value = *reinterpret_cast<value_t *>(vu);
+    auto w_data = dpct::detail::get_memory<value_t>(w);
+    auto abstol = 2 * lamch_s<value_t>();
+    DISPATCH_FLOAT_FOR_CALCULATION(evx, q, jobz, range, uplo, n, a_data, lda,
+                                   vl_value, vu_value, il, iu, abstol,
+                                   m_device_data, w_data, z_data, lda,
+                                   device_ws_data, device_ws_size);
+    dpct::async_dpct_memcpy(a, z.get_ptr(), n * lda * sizeof(T),
+                            memcpy_direction::device_to_device, q);
+    dpct::async_dpct_memcpy(m, m_device.get_ptr(), sizeof(std::int64_t),
+                            memcpy_direction::device_to_host, q);
+    sycl::event e = dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+    z.set_event(e);
+    m_device.set_event(e);
+#endif
+  }
+};
+
+template <typename T> struct syhegvx_scratchpad_size_impl {
+  void operator()(sycl::queue &q, std::int64_t itype, oneapi::mkl::compz jobz,
+                  oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
+                  std::int64_t n, std::int64_t lda, std::int64_t ldb, void *vl,
+                  void *vu, std::int64_t il, std::int64_t iu,
+                  std::size_t &device_ws_size) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error(
+        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
+        "Project does not support this API.");
+#else
+    using value_t = typename value_type_trait<T>::value_type;
+    auto vl_value = *reinterpret_cast<value_t *>(vl);
+    auto vu_value = *reinterpret_cast<value_t *>(vu);
+    auto abstol = 2 * lamch_s<value_t>();
+    DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(gvx_scratchpad_size<T>, q, itype, jobz,
+                                       range, uplo, n, lda, ldb, vl_value,
+                                       vu_value, il, iu, abstol, lda);
+#endif
+  }
+};
+
+template <typename T> struct syhegvx_impl {
+  void operator()(sycl::queue &q, std::int64_t itype, oneapi::mkl::compz jobz,
+                  oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
+                  std::int64_t n, void *a, std::int64_t lda, void *b,
+                  std::int64_t ldb, void *vl, void *vu, std::int64_t il,
+                  std::int64_t iu, std::int64_t *m, void *w, void *device_ws,
+                  std::size_t device_ws_size, int *info) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error(
+        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
+        "Project does not support this API.");
+#else
+    using value_t = typename value_type_trait<T>::value_type;
+    working_memory<T> z(n * lda, q);
+    working_memory<std::int64_t> m_device(1, q);
+    auto z_data = z.get_memory();
+    auto m_device_data = m_device.get_memory();
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto b_data = dpct::detail::get_memory<T>(b);
+    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
+    auto vl_value = *reinterpret_cast<value_t *>(vl);
+    auto vu_value = *reinterpret_cast<value_t *>(vu);
+    auto w_data = dpct::detail::get_memory<value_t>(w);
+    auto abstol = 2 * lamch_s<value_t>();
+    DISPATCH_FLOAT_FOR_CALCULATION(gvx, q, itype, jobz, range, uplo, n, a_data,
+                                   lda, b_data, ldb, vl_value, vu_value, il, iu,
+                                   abstol, m_device_data, w_data, z_data, lda,
+                                   device_ws_data, device_ws_size);
+    dpct::async_dpct_memcpy(a, z.get_ptr(), n * lda * sizeof(T),
+                            memcpy_direction::device_to_device, q);
+    dpct::async_dpct_memcpy(m, m_device.get_ptr(), sizeof(std::int64_t),
+                            memcpy_direction::device_to_host, q);
+    sycl::event e = dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+    z.set_event(e);
+    m_device.set_event(e);
+#endif
+  }
+};
+
+template <typename T> struct syhegvd_scratchpad_size_impl {
+  void operator()(sycl::queue &q, std::int64_t itype, oneapi::mkl::job jobz,
+                  oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
+                  std::int64_t ldb, std::size_t &device_ws_size) {
+    DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(gvd_scratchpad_size<T>, q, itype, jobz,
+                                       uplo, n, lda, ldb);
+  }
+};
+
+template <typename T> struct syhegvd_impl {
+  void operator()(sycl::queue &q, std::int64_t itype, oneapi::mkl::job jobz,
+                  oneapi::mkl::uplo uplo, std::int64_t n, void *a,
+                  std::int64_t lda, void *b, std::int64_t ldb, void *w,
+                  void *device_ws, std::size_t device_ws_size,
+                  int *info) {
+    using value_t = typename value_type_trait<T>::value_type;
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto b_data = dpct::detail::get_memory<T>(b);
+    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
+    auto w_data = dpct::detail::get_memory<value_t>(w);
+    DISPATCH_FLOAT_FOR_CALCULATION(gvd, q, itype, jobz, uplo, n, a_data, lda,
+                                   b_data, ldb, w_data, device_ws_data,
+                                   device_ws_size);
+    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+  }
+};
+
+oneapi::mkl::compz job2compz(const oneapi::mkl::job &job) {
+  oneapi::mkl::compz ret;
+  if (job == oneapi::mkl::job::novec) {
+    ret = oneapi::mkl::compz::novectors;
+  } else if (job == oneapi::mkl::job::vec) {
+    ret = oneapi::mkl::compz::vectors;
+  } else {
+    throw std::runtime_error("the job type is unsupported");
+  }
+  return ret;
+}
+
+template <typename T> struct syheev_scratchpad_size_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::compz jobz,
+                  oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
+                  std::size_t &device_ws_size) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error(
+        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
+        "Project does not support this API.");
+#else
+    DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(ev_scratchpad_size<T>, q, jobz, uplo, n,
+                                       lda);
+#endif
+  }
+};
+
+template <typename T> struct syheev_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::compz jobz,
+                  oneapi::mkl::uplo uplo, std::int64_t n, void *a,
+                  std::int64_t lda, void *w, void *device_ws,
+                  std::size_t device_ws_size, int *info) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error(
+        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
+        "Project does not support this API.");
+#else
+    using value_t = typename value_type_trait<T>::value_type;
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
+    auto w_data = dpct::detail::get_memory<value_t>(w);
+    DISPATCH_FLOAT_FOR_CALCULATION(ev, q, jobz, uplo, n, a_data, lda, w_data,
+                                   device_ws_data, device_ws_size);
+    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+#endif
+  }
+};
+
+template <typename T> struct syheevd_scratchpad_size_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
+                  std::int64_t n, library_data_t a_type, std::int64_t lda,
+                  std::size_t &device_ws_size) {
+    DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(evd_scratchpad_size<T>, q, jobz, uplo, n,
+                                       lda);
+  }
+};
+
+template <typename T> struct syheevd_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
+                  std::int64_t n, library_data_t a_type, void *a,
+                  std::int64_t lda, void *w, void *device_ws,
+                  std::size_t device_ws_size, int *info) {
+    using value_t = typename value_type_trait<T>::value_type;
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
+    auto w_data = dpct::detail::get_memory<value_t>(w);
+    DISPATCH_FLOAT_FOR_CALCULATION(evd, q, jobz, uplo, n, a_data, lda, w_data,
+                                   device_ws_data, device_ws_size);
+    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+  }
+};
+
+#undef DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE
+#undef DISPATCH_FLOAT_FOR_CALCULATION
+
+template <typename T> struct trtri_scratchpad_size_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::uplo uplo,
+                  oneapi::mkl::diag diag, std::int64_t n, library_data_t a_type,
+                  std::int64_t lda, std::size_t &device_ws_size) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error(
+        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
+        "Project does not support this API.");
+#else
+    device_ws_size =
+        oneapi::mkl::lapack::trtri_scratchpad_size<T>(q, uplo, diag, n, lda);
+#endif
+  }
+};
+
+template <typename T> struct trtri_impl {
+  void operator()(sycl::queue &q, oneapi::mkl::uplo uplo,
+                  oneapi::mkl::diag diag, std::int64_t n, library_data_t a_type,
+                  void *a, std::int64_t lda, void *device_ws,
+                  std::size_t device_ws_size, int *info) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error(
+        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
+        "Project does not support this API.");
+#else
+    auto a_data = dpct::detail::get_memory<T>(a);
+    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
+    oneapi::mkl::lapack::trtri(q, uplo, diag, n, a_data, lda, device_ws_data,
+                               device_ws_size);
+    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
+#endif
+  }
+};
+} // namespace detail
+
+/// Computes the size of workspace memory of getrf function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] m The number of rows in the matrix A.
+/// \param [in] n The number of columns in the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [out] device_ws_size The workspace size in bytes.
+/// \param [out] host_ws_size The host workspace size in bytes. Currently the
+/// value is always zero.
+inline int getrf_scratchpad_size(sycl::queue &q, std::int64_t m, std::int64_t n,
+                                 library_data_t a_type, std::int64_t lda,
+                                 std::size_t *device_ws_size,
+                                 std::size_t *host_ws_size = nullptr) {
+  if (host_ws_size)
+    *host_ws_size = 0;
+  std::size_t device_ws_size_tmp;
+  int ret = detail::lapack_shim<detail::getrf_scratchpad_size_impl>(
+      q, a_type, nullptr, "getrf_scratchpad_size", q, m, n, a_type, lda,
+      device_ws_size_tmp);
+  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
+  return ret;
+}
+
+/// Computes the LU factorization of a general m-by-n matrix.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] m The number of rows in the matrix A.
+/// \param [in] n The number of columns in the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in, out] a The input matrix A. Overwritten by L and U. The unit
+/// diagonal elements of L are not stored.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [out] ipiv The pivot indices. If \p ipiv is nullptr, non-pivoting
+/// LU factorization is computed.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The workspace size in bytes.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+inline int getrf(sycl::queue &q, std::int64_t m, std::int64_t n,
+                 library_data_t a_type, void *a, std::int64_t lda,
+                 std::int64_t *ipiv, void *device_ws,
+                 std::size_t device_ws_size, int *info) {
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                           "Project does not support this API.");
+#else
+  std::size_t device_ws_size_in_element_number =
+      detail::byte_to_element_number(device_ws_size, a_type);
+  if (ipiv == nullptr) {
+    return detail::lapack_shim<detail::getrfnp_impl>(
+        q, a_type, info, "getrfnp_batch", q, m, n, a_type, a, lda, ipiv,
+        device_ws, device_ws_size_in_element_number, info);
+  }
+  return detail::lapack_shim<detail::getrf_impl>(
+      q, a_type, info, "getrf", q, m, n, a_type, a, lda, ipiv, device_ws,
+      device_ws_size_in_element_number, info);
+#endif
+}
+
+/// Solves a system of linear equations with a LU-factored square coefficient
+/// matrix, with multiple right-hand sides.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] trans Indicates the form of the linear equation.
+/// \param [in] n The order of the matrix A and the number of rows in matrix B.
+/// \param [in] nrhs The number of right hand sides.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in] a The input matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] ipiv The pivot indices.
+/// \param [in] b_type The data type of the matrix B.
+/// \param [in, out] b The matrix B, whose columns are the right-hand sides
+/// for the systems of equations.
+/// \param [in] ldb The leading dimension of the matrix B.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+inline int getrs(sycl::queue &q, oneapi::mkl::transpose trans, std::int64_t n,
+                 std::int64_t nrhs, library_data_t a_type, void *a,
+                 std::int64_t lda, std::int64_t *ipiv, library_data_t b_type,
+                 void *b, std::int64_t ldb, int *info) {
+  return detail::lapack_shim<detail::getrs_impl>(
+      q, a_type, info, "getrs_scratchpad_size/getrs", q, trans, n, nrhs, a_type,
+      a, lda, ipiv, b_type, b, ldb, info);
+}
+
+/// Computes the size of workspace memory of geqrf function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] m The number of rows in the matrix A.
+/// \param [in] n The number of columns in the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [out] device_ws_size The device workspace size in bytes.
+/// \param [out] host_ws_size The host workspace size in bytes. Currently the
+/// value is always zero.
+inline int geqrf_scratchpad_size(sycl::queue &q, std::int64_t m, std::int64_t n,
+                                 library_data_t a_type, std::int64_t lda,
+                                 std::size_t *device_ws_size,
+                                 std::size_t *host_ws_size = nullptr) {
+  if (host_ws_size)
+    *host_ws_size = 0;
+  std::size_t device_ws_size_tmp;
+  int ret = detail::lapack_shim<detail::geqrf_scratchpad_size_impl>(
+      q, a_type, nullptr, "geqrf_scratchpad_size", q, m, n, a_type, lda,
+      device_ws_size_tmp);
+  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
+  return ret;
+}
+
+/// Computes the QR factorization of a general m-by-n matrix.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] m The number of rows in the matrix A.
+/// \param [in] n The number of columns in the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in, out] a The input matrix A. Overwritten by the factorization data.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] tau_type The data type of the array tau.
+/// \param [in] tau The array contains scalars that define elementary reflectors
+/// for the matrix Q in its decomposition in a product of elementary reflectors.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The workspace size in bytes.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+inline int geqrf(sycl::queue &q, std::int64_t m, std::int64_t n,
+                 library_data_t a_type, void *a, std::int64_t lda,
+                 library_data_t tau_type, void *tau, void *device_ws,
+                 std::size_t device_ws_size, int *info) {
+  std::size_t device_ws_size_in_element_number =
+      detail::byte_to_element_number(device_ws_size, a_type);
+  return detail::lapack_shim<detail::geqrf_impl>(
+      q, a_type, info, "geqrf", q, m, n, a_type, a, lda, tau_type, tau,
+      device_ws, device_ws_size_in_element_number, info);
+}
+
+/// Computes the size of workspace memory of gesvd function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobu Must be 'A' (representing jobsvd::vectors), 'S'
+/// (representing jobsvd::somevec), 'O' (representing jobsvd::vectorsina) or 'N'
+/// (representing jobsvd::novec).
+/// \param [in] jobvt Must be 'A' (representing jobsvd::vectors), 'S'
+/// (representing jobsvd::somevec), 'O' (representing jobsvd::vectorsina) or 'N'
+/// (representing jobsvd::novec).
+/// \param [in] m The number of rows in the matrix A.
+/// \param [in] n The number of columns in the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] u_type The data type of the matrix U.
+/// \param [in] ldu The leading dimension of the matrix U.
+/// \param [in] vt_type The data type of the matrix VT.
+/// \param [in] ldvt The leading dimension of the matrix VT.
+/// \param [out] device_ws_size The device workspace size in bytes.
+/// \param [out] host_ws_size The host workspace size in bytes. Currently the
+/// value is always zero.
+inline int gesvd_scratchpad_size(sycl::queue &q, signed char jobu,
+                                 signed char jobvt, std::int64_t m,
+                                 std::int64_t n, library_data_t a_type,
+                                 std::int64_t lda, library_data_t u_type,
+                                 std::int64_t ldu, library_data_t vt_type,
+                                 std::int64_t ldvt, std::size_t *device_ws_size,
+                                 std::size_t *host_ws_size = nullptr) {
+  oneapi::mkl::jobsvd jobu_enum = detail::char2jobsvd(jobu);
+  oneapi::mkl::jobsvd jobvt_enum = detail::char2jobsvd(jobvt);
+  if (host_ws_size)
+    *host_ws_size = 0;
+  std::size_t device_ws_size_tmp;
+  int ret = detail::lapack_shim<detail::gesvd_scratchpad_size_impl>(
+      q, a_type, nullptr, "gesvd_scratchpad_size", q, jobu_enum, jobvt_enum, m,
+      n, a_type, lda, u_type, ldu, vt_type, ldvt, device_ws_size_tmp);
+  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
+  return ret;
+}
+
+/// Computes the size of workspace memory of gesvd function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::vec or job::novec
+/// \param [in] all_vec Only have effects when \param jobz is job::vec.If the
+/// value is zero, all m columns of U are returned in the matrix U, otherwise
+/// the first min( \param m, \param n ) columns of U (the left singular vectors)
+/// are returned in the matrix U.
+/// \param [in] m The number of rows in the matrix A.
+/// \param [in] n The number of columns in the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] u_type The data type of the matrix U.
+/// \param [in] ldu The leading dimension of the matrix U.
+/// \param [in] vt_type The data type of the matrix VT.
+/// \param [in] ldvt The leading dimension of the matrix VT.
+/// \param [out] device_ws_size The device workspace size as a number of
+/// elements of type \param a_type.
+/// \param [out] host_ws_size The host workspace size as a number of elements
+/// of type \param a_type. Currently the value is always zero.
+inline int gesvd_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
+                                 std::int64_t all_vec, std::int64_t m,
+                                 std::int64_t n, library_data_t a_type,
+                                 std::int64_t lda, library_data_t u_type,
+                                 std::int64_t ldu, library_data_t vt_type,
+                                 std::int64_t ldvt, int *device_ws_size,
+                                 std::size_t *host_ws_size = nullptr) {
+  if (host_ws_size)
+    *host_ws_size = 0;
+  oneapi::mkl::jobsvd jobu;
+  oneapi::mkl::jobsvd jobvt;
+  if (jobz == oneapi::mkl::job::vec) {
+    if (all_vec) {
+      jobu = jobvt = oneapi::mkl::jobsvd::somevec;
+    } else {
+      jobu = jobvt = oneapi::mkl::jobsvd::vectors;
+    }
+  } else if (jobz == oneapi::mkl::job::novec) {
+    jobu = jobvt = oneapi::mkl::jobsvd::novec;
+  } else {
+    throw std::runtime_error("the job type is unsupported");
+  }
+  std::size_t device_ws_size_64;
+  int ret = detail::lapack_shim<detail::gesvd_scratchpad_size_impl>(
+      q, a_type, nullptr, "gesvd_scratchpad_size", q, jobu, jobvt, m, n, a_type,
+      lda, u_type, ldu, vt_type, ldvt, device_ws_size_64);
+  *device_ws_size = device_ws_size_64;
+  return ret;
+}
+
+/// Computes the size of workspace memory of gesvd function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobu Must be 'A' (representing jobsvd::vectors), 'S'
+/// (representing jobsvd::somevec), 'O' (representing jobsvd::vectorsina) or 'N'
+/// (representing jobsvd::novec).
+/// \param [in] jobvt Must be 'A' (representing jobsvd::vectors), 'S'
+/// (representing jobsvd::somevec), 'O' (representing jobsvd::vectorsina) or 'N'
+/// (representing jobsvd::novec).
+/// \param [in] m The number of rows in the matrix A.
+/// \param [in] n The number of columns in the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in, out] a The input matrix A and it will be overwritten according
+/// to \p jobu and \p jobvt.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] s_type The data type of the matrix S.
+/// \param [out] s The output matrix S.
+/// \param [in] u_type The data type of the matrix U.
+/// \param [out] u The output matrix U.
+/// \param [in] ldu The leading dimension of the matrix U.
+/// \param [in] vt_type The data type of the matrix VT.
+/// \param [out] vt The output matrix VT.
+/// \param [in] ldvt The leading dimension of the matrix VT.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The workspace size in bytes.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+inline int gesvd(sycl::queue &q, signed char jobu, signed char jobvt,
+                 std::int64_t m, std::int64_t n, library_data_t a_type, void *a,
+                 std::int64_t lda, library_data_t s_type, void *s,
+                 library_data_t u_type, void *u, std::int64_t ldu,
+                 library_data_t vt_type, void *vt, std::int64_t ldvt,
+                 void *device_ws, std::size_t device_ws_size, int *info) {
+  oneapi::mkl::jobsvd jobu_enum = detail::char2jobsvd(jobu);
+  oneapi::mkl::jobsvd jobvt_enum = detail::char2jobsvd(jobvt);
+  std::size_t device_ws_size_in_element_number =
+      detail::byte_to_element_number(device_ws_size, a_type);
+  return detail::lapack_shim<detail::gesvd_impl>(
+      q, a_type, info, "gesvd", q, jobu_enum, jobvt_enum, m, n, a_type, a, lda,
+      s_type, s, u_type, u, ldu, vt_type, vt, ldvt, device_ws,
+      device_ws_size_in_element_number, info);
+}
+
+/// Computes the size of workspace memory of gesvd function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::vec or job::novec.
+/// \param [in] all_vec Only have effects when \param jobz is job::vec.If the
+/// value is zero, all m columns of U are returned in the matrix U, otherwise
+/// the first min( \param m, \param n ) columns of U (the left singular vectors)
+/// are returned in the matrix U.
+/// \param [in] m The number of rows in the matrix A.
+/// \param [in] n The number of columns in the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in, out] a The input matrix A and it will be overwritten according
+/// to \p jobu and \p jobvt.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] s_type The data type of the matrix S.
+/// \param [out] s The output matrix S.
+/// \param [in] u_type The data type of the matrix U.
+/// \param [out] u The output matrix U.
+/// \param [in] ldu The leading dimension of the matrix U.
+/// \param [in] vt_type The data type of the matrix VT.
+/// \param [out] vt The output matrix VT.
+/// \param [in] ldvt The leading dimension of the matrix VT.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The device workspace size as a number of
+/// elements of type \param a_type.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+inline int gesvd(sycl::queue &q, oneapi::mkl::job jobz, std::int64_t all_vec,
+                 std::int64_t m, std::int64_t n, library_data_t a_type, void *a,
+                 std::int64_t lda, library_data_t s_type, void *s,
+                 library_data_t u_type, void *u, std::int64_t ldu,
+                 library_data_t vt_type, void *vt, std::int64_t ldvt,
+                 void *device_ws, std::size_t device_ws_size, int *info) {
+  oneapi::mkl::jobsvd jobu;
+  oneapi::mkl::jobsvd jobvt;
+  if (jobz == oneapi::mkl::job::vec) {
+    if (all_vec) {
+      jobu = jobvt = oneapi::mkl::jobsvd::somevec;
+    } else {
+      jobu = jobvt = oneapi::mkl::jobsvd::vectors;
+    }
+  } else if (jobz == oneapi::mkl::job::novec) {
+    jobu = jobvt = oneapi::mkl::jobsvd::novec;
+  } else {
+    throw std::runtime_error("the job type is unsupported");
+  }
+
+  detail::lapack_shim<detail::gesvd_conj_impl>(
+      q, a_type, info, "gesvd", q, jobu, jobvt, m, n, a_type, a, lda, s_type, s,
+      u_type, u, ldu, vt_type, vt, ldvt, device_ws, device_ws_size, info);
+  return 0;
+}
+
+/// Computes the size of workspace memory of potrf function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The number of columns in the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [out] device_ws_size The device workspace size in bytes.
+/// \param [out] host_ws_size The host workspace size in bytes. Currently the
+/// value is always zero.
+inline int potrf_scratchpad_size(sycl::queue &q, oneapi::mkl::uplo uplo,
+                                 std::int64_t n, library_data_t a_type,
+                                 std::int64_t lda, std::size_t *device_ws_size,
+                                 std::size_t *host_ws_size = nullptr) {
+  if (host_ws_size)
+    *host_ws_size = 0;
+  std::size_t device_ws_size_tmp;
+  int ret = detail::lapack_shim<detail::potrf_scratchpad_size_impl>(
+      q, a_type, nullptr, "potrf_scratchpad_size", q, uplo, n, a_type, lda,
+      device_ws_size_tmp);
+  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
+  return ret;
+}
+
+/// Computes the Cholesky factorization of a symmetric (Hermitian)
+/// positive-definite matrix.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The number of columns in the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in, out] a The input matrix A. Overwritten by the Cholesky factor U
+/// or L, as specified by \p uplo.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The workspace size in bytes.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+inline int potrf(sycl::queue &q, oneapi::mkl::uplo uplo, std::int64_t n,
+                 library_data_t a_type, void *a, std::int64_t lda,
+                 void *device_ws, std::size_t device_ws_size, int *info) {
+  std::size_t device_ws_size_in_element_number =
+      detail::byte_to_element_number(device_ws_size, a_type);
+  return detail::lapack_shim<detail::potrf_impl>(
+      q, a_type, info, "potrf", q, uplo, n, a_type, a, lda, device_ws,
+      device_ws_size_in_element_number, info);
+}
+
+/// Solves a system of linear equations with a Cholesky-factored symmetric
+/// (Hermitian) positive-definite coefficient matrix.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A and the number of rows in matrix B.
+/// \param [in] nrhs The number of right hand sides.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in, out] a The input matrix A. Overwritten by the Cholesky factor U
+/// or L, as specified by \p uplo.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] b_type The data type of the matrix B.
+/// \param [in, out] b The matrix B, whose columns are the right-hand sides
+/// for the systems of equations.
+/// \param [in] ldb The leading dimension of the matrix B.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+inline int potrs(sycl::queue &q, oneapi::mkl::uplo uplo, std::int64_t n,
+                 std::int64_t nrhs, library_data_t a_type, void *a,
+                 std::int64_t lda, library_data_t b_type, void *b,
+                 std::int64_t ldb, int *info) {
+  return detail::lapack_shim<detail::potrs_impl>(
+      q, a_type, info, "potrs_scratchpad_size/potrs", q, uplo, n, nrhs, a_type,
+      a, lda, b_type, b, ldb, info);
+}
+
+/// Computes the size of workspace memory of syevx/heevx function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] vl If range == rangev::values, the lower bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] vu If range == rangev::values, the upper bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] il If range == rangev::indices, the indices of the smallest
+/// eigenvalue to be returned.
+/// \param [in] iu If range == rangev::indices, the indices of the largest
+/// eigenvalue to be returned.
+/// \param [in] w_type The data type of the eigenvalues.
+/// \param [out] device_ws_size The device workspace size in bytes.
+/// \param [out] host_ws_size The host workspace size in bytes. Currently the
+/// value is always zero.
+inline int syheevx_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
+                                   oneapi::mkl::rangev range,
+                                   oneapi::mkl::uplo uplo, std::int64_t n,
+                                   library_data_t a_type, std::int64_t lda,
+                                   void *vl, void *vu, std::int64_t il,
+                                   std::int64_t iu, library_data_t w_type,
+                                   std::size_t *device_ws_size,
+                                   std::size_t *host_ws_size = nullptr) {
+  if (host_ws_size)
+    *host_ws_size = 0;
+  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
+  std::size_t device_ws_size_tmp;
+  int ret = detail::lapack_shim<detail::syheevx_scratchpad_size_impl>(
+      q, a_type, nullptr, "syevx_scratchpad_size/heevx_scratchpad_size", q,
+      compz_jobz, range, uplo, n, lda, vl, vu, il, iu,
+      device_ws_size_tmp);
+  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
+  return ret;
+}
+
+/// Computes selected eigenvalues and, optionally, eigenvectors of a
+/// symmetric/Hermitian matrix.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in, out] a The input matrix A. On exit, the lower or upper triangle is
+/// overwritten.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] vl If range == rangev::values, the lower bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] vu If range == rangev::values, the upper bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] il If range == rangev::indices, the indices of the smallest
+/// eigenvalue to be returned.
+/// \param [in] iu If range == rangev::indices, the indices of the largest
+/// eigenvalue to be returned.
+/// \param [out] m The total number of eigenvalues found.
+/// \param [in] w_type The data type of the eigenvalues.
+/// \param [out] w The eigenvalues of the matrix A in ascending order.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The workspace size in bytes.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+inline int syheevx(sycl::queue &q, oneapi::mkl::job jobz,
+                   oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
+                   std::int64_t n, library_data_t a_type, void *a,
+                   std::int64_t lda, void *vl, void *vu, std::int64_t il,
+                   std::int64_t iu, std::int64_t *m, library_data_t w_type,
+                   void *w, void *device_ws, std::size_t device_ws_size,
+                   int *info) {
+  std::size_t device_ws_size_in_element_number =
+      detail::byte_to_element_number(device_ws_size, a_type);
+  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
+  int ret = detail::lapack_shim<detail::syheevx_impl>(
+      q, a_type, info, "syevx/heevx", q, compz_jobz, range, uplo, n, a_type, a,
+      lda, vl, vu, il, iu, m, w_type, w, device_ws,
+      device_ws_size_in_element_number, info);
+  q.wait();
+  return ret;
+}
+
+/// Computes the size of workspace memory of syevx/heevx function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] vl If range == rangev::values, the lower bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] vu If range == rangev::values, the upper bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] il If range == rangev::indices, the indices of the smallest
+/// eigenvalue to be returned.
+/// \param [in] iu If range == rangev::indices, the indices of the largest
+/// eigenvalue to be returned.
+/// \param [out] device_ws_size The device workspace size as a number of
+/// elements of type \tparam T.
+template <typename T, typename ValueT>
+inline int syheevx_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
+                                   oneapi::mkl::rangev range,
+                                   oneapi::mkl::uplo uplo, int n, int lda,
+                                   ValueT vl, ValueT vu, int il, int iu,
+                                   int *device_ws_size) {
+  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
+  std::size_t device_ws_size_tmp;
+  int ret = detail::lapack_shim<detail::syheevx_scratchpad_size_impl>(
+      q, detail::get_library_data_t_from_type<T>(), nullptr,
+      "syevx_scratchpad_size/heevx_scratchpad_size", q, compz_jobz, range, uplo,
+      n, lda, &vl, &vu, il, iu, device_ws_size_tmp);
+  *device_ws_size = (int)device_ws_size_tmp;
+  return ret;
+}
+
+/// Computes selected eigenvalues and, optionally, eigenvectors of a
+/// symmetric/Hermitian matrix.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in, out] a The input matrix A. On exit, the lower or upper triangle is
+/// overwritten.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] vl If range == rangev::values, the lower bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] vu If range == rangev::values, the upper bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] il If range == rangev::indices, the indices of the smallest
+/// eigenvalue to be returned.
+/// \param [in] iu If range == rangev::indices, the indices of the largest
+/// eigenvalue to be returned.
+/// \param [out] m The total number of eigenvalues found.
+/// \param [out] w The eigenvalues of the matrix A in ascending order.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The device workspace size as a number of
+/// elements of type \tparam T.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+template <typename T, typename ValueT>
+inline int syheevx(sycl::queue &q, oneapi::mkl::job jobz,
+                   oneapi::mkl::rangev range, oneapi::mkl::uplo uplo, int n,
+                   T *a, int lda, ValueT vl, ValueT vu, int il, int iu, int *m,
+                   ValueT *w, T *device_ws, int device_ws_size, int *info) {
+  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
+  std::int64_t m64;
+  int ret = detail::lapack_shim<detail::syheevx_impl>(
+      q, detail::get_library_data_t_from_type<T>(), info, "syevx/heevx", q,
+      compz_jobz, range, uplo, n, detail::get_library_data_t_from_type<T>(), a,
+      lda, &vl, &vu, il, iu, &m64,
+      detail::get_library_data_t_from_type<ValueT>(), w, device_ws,
+      device_ws_size, info);
+  q.wait();
+  *m = (int)m64;
+  return ret;
+}
+
+/// Computes the size of workspace memory of sygvx/hegvx function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] itype Must be 1, 2 or 3.
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] ldb The leading dimension of the matrix B.
+/// \param [in] vl If range == rangev::values, the lower bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] vu If range == rangev::values, the upper bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] il If range == rangev::indices, the indices of the smallest
+/// eigenvalue to be returned.
+/// \param [in] iu If range == rangev::indices, the indices of the largest
+/// eigenvalue to be returned.
+/// \param [in] device_ws_size The device workspace size as a number of
+/// elements of type \tparam T.
+template <typename T, typename ValueT>
+inline int
+syhegvx_scratchpad_size(sycl::queue &q, int itype, oneapi::mkl::job jobz,
+                        oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
+                        int n, int lda, int ldb, ValueT vl, ValueT vu, int il,
+                        int iu, int *device_ws_size) {
+  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
+  std::size_t device_ws_size_tmp;
+  int ret = detail::lapack_shim<detail::syhegvx_scratchpad_size_impl>(
+      q, detail::get_library_data_t_from_type<T>(), nullptr,
+      "sygvx_scratchpad_size/hegvx_scratchpad_size", q, itype, compz_jobz,
+      range, uplo, n, lda, ldb, &vl, &vu, il, iu, device_ws_size_tmp);
+  *device_ws_size = (int)device_ws_size_tmp;
+  return ret;
+}
+
+/// Computes selected eigenvalues and, optionally, eigenvectors of a real
+/// generalized symmetric/Hermitian definite eigenproblem.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] itype Must be 1, 2 or 3.
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in, out] a The input matrix A. On exit, the lower or upper triangle is
+/// overwritten.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in, out] b The input matrix B.
+/// \param [in] ldb The leading dimension of the matrix B.
+/// \param [in] vl If range == rangev::values, the lower bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] vu If range == rangev::values, the upper bound of the interval
+/// to be searched for eigenvalues
+/// \param [in] il If range == rangev::indices, the indices of the smallest
+/// eigenvalue to be returned.
+/// \param [in] iu If range == rangev::indices, the indices of the largest
+/// eigenvalue to be returned.
+/// \param [out] m The total number of eigenvalues found.
+/// \param [out] w The eigenvalues of the matrix A in ascending order.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The device workspace size as a number of
+/// elements of type \tparam T.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+template <typename T, typename ValueT>
+inline int syhegvx(sycl::queue &q, int itype, oneapi::mkl::job jobz,
+                   oneapi::mkl::rangev range, oneapi::mkl::uplo uplo, int n,
+                   T *a, int lda, T *b, int ldb, ValueT vl, ValueT vu, int il,
+                   int iu, int *m, ValueT *w, T *device_ws, int device_ws_size,
+                   int *info) {
+  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
+  std::int64_t m64;
+  int ret = detail::lapack_shim<detail::syhegvx_impl>(
+      q, detail::get_library_data_t_from_type<T>(), info, "sygvx/hegvx", q,
+      itype, compz_jobz, range, uplo, n, a, lda, b, ldb, &vl, &vu, il, iu, &m64,
+      w, device_ws, device_ws_size, info);
+  q.wait();
+  *m = (int)m64;
+  return ret;
+}
+
+/// Computes the size of workspace memory of sygvd/hegvd function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] itype Must be 1, 2 or 3.
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] ldb The leading dimension of the matrix B.
+/// \param [in] device_ws_size The device workspace size as a number of
+/// elements of type \tparam T.
+template <typename T>
+inline int syhegvd_scratchpad_size(sycl::queue &q, int itype,
+                                   oneapi::mkl::job jobz,
+                                   oneapi::mkl::uplo uplo, int n, int lda,
+                                   int ldb, int *device_ws_size) {
+  std::size_t device_ws_size_tmp;
+  int ret = detail::lapack_shim<detail::syhegvd_scratchpad_size_impl>(
+      q, detail::get_library_data_t_from_type<T>(), nullptr,
+      "sygvd_scratchpad_size/hegvd_scratchpad_size", q, itype, jobz, uplo, n,
+      lda, ldb, device_ws_size_tmp);
+  *device_ws_size = (int)device_ws_size_tmp;
+  return ret;
+}
+
+/// Computes all eigenvalues and, optionally, eigenvectors of a real generalized
+/// symmetric/Hermitian definite eigenproblem using a divide and conquer method.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] itype Must be 1, 2 or 3.
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in, out] a The input matrix A. On exit, it is overwritten by eigenvectors.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in, out] b The input matrix B.
+/// \param [in] ldb The leading dimension of the matrix B.
+/// \param [out] w The eigenvalues of the matrix A in ascending order.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The device workspace size as a number of
+/// elements of type \tparam T.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+template <typename T, typename ValueT>
+inline int syhegvd(sycl::queue &q, int itype, oneapi::mkl::job jobz,
+                   oneapi::mkl::uplo uplo, int n, T *a, int lda, T *b, int ldb,
+                   ValueT *w, T *device_ws, int device_ws_size, int *info) {
+  return detail::lapack_shim<detail::syhegvd_impl>(
+      q, detail::get_library_data_t_from_type<T>(), info, "sygvd/hegvd", q,
+      itype, jobz, uplo, n, a, lda, b, ldb, w, device_ws, device_ws_size, info);
+}
+
+/// Computes the size of workspace memory of syev/heev function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [out] device_ws_size The device workspace size as a number of
+/// elements of type \tparam T.
+template <typename T>
+inline int syheev_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
+                                  oneapi::mkl::uplo uplo, int n, int lda,
+                                  int *device_ws_size) {
+  std::size_t device_ws_size_tmp;
+  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
+  int ret = detail::lapack_shim<detail::syheev_scratchpad_size_impl>(
+      q, detail::get_library_data_t_from_type<T>(), nullptr,
+      "syev_scratchpad_size/heev_scratchpad_size", q, compz_jobz, uplo, n, lda,
+      device_ws_size_tmp);
+  *device_ws_size = (int)device_ws_size_tmp;
+  return ret;
+}
+
+/// Computes all eigenvalues and, optionally, eigenvectors of a real symmetric
+/// or Hermitian matrix.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in, out] a The input matrix A. On exit, it is overwritten by
+/// eigenvectors.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [out] w The eigenvalues of the matrix A in ascending order.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The device workspace size as a number of
+/// elements of type \tparam T.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+template <typename T, typename ValueT>
+inline int syheev(sycl::queue &q, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
+                  int n, T *a, int lda, ValueT *w, T *device_ws,
+                  int device_ws_size, int *info) {
+  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
+  return detail::lapack_shim<detail::syheev_impl>(
+      q, detail::get_library_data_t_from_type<T>(), info, "syev/heev", q,
+      compz_jobz, uplo, n, a, lda, w, device_ws, device_ws_size, info);
+}
+
+/// Computes the size of workspace memory of syevd/heevd function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] w_type The data type of the eigenvalues.
+/// \param [out] device_ws_size The device workspace in bytes.
+/// \param [out] host_ws_size The host workspace size in bytes. Currently the
+/// value is always zero.
+inline int syheevd_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
+                                   oneapi::mkl::uplo uplo, std::int64_t n,
+                                   library_data_t a_type, std::int64_t lda,
+                                   library_data_t w_type,
+                                   std::size_t *device_ws_size,
+                                   std::size_t *host_ws_size = nullptr) {
+  if (host_ws_size)
+    *host_ws_size = 0;
+  std::size_t device_ws_size_tmp;
+  int ret = detail::lapack_shim<detail::syheevd_scratchpad_size_impl>(
+      q, a_type, nullptr, "syevd_scratchpad_size/heevd_scratchpad_size", q,
+      jobz, uplo, n, a_type, lda, device_ws_size_tmp);
+  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
+  return ret;
+}
+
+/// Computes all eigenvalues and, optionally, all eigenvectors of a real
+/// symmetric or Hermitian matrix using divide and conquer algorithm.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in, out] a The input matrix A. On exit, it is overwritten by
+/// eigenvectors.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] w_type The data type of the eigenvalues.
+/// \param [out] w The eigenvalues of the matrix A in ascending order.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The workspace size in bytes.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+inline int syheevd(sycl::queue &q, oneapi::mkl::job jobz,
+                   oneapi::mkl::uplo uplo, std::int64_t n,
+                   library_data_t a_type, void *a, std::int64_t lda,
+                   library_data_t w_type, void *w, void *device_ws,
+                   std::size_t device_ws_size, int *info) {
+  std::size_t device_ws_size_in_element_number =
+      detail::byte_to_element_number(device_ws_size, a_type);
+  return detail::lapack_shim<detail::syheevd_impl>(
+      q, a_type, info, "syevd/heevd", q, jobz, uplo, n, a_type, a, lda, w,
+      device_ws, device_ws_size_in_element_number, info);
+}
+
+/// Computes the size of workspace memory of syevd/heevd function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] w_type The data type of the eigenvalues.
+/// \param [out] device_ws_size The device workspace size as a number of
+/// elements of type \tparam T.
+template <typename T>
+inline int syheevd_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
+                                   oneapi::mkl::uplo uplo, std::int64_t n,
+                                   std::int64_t lda, int *device_ws_size) {
+  std::size_t device_ws_size_tmp;
+  int ret = detail::lapack_shim<detail::syheevd_scratchpad_size_impl>(
+      q, detail::get_library_data_t_from_type<T>(), nullptr,
+      "syevd_scratchpad_size/heevd_scratchpad_size", q, jobz, uplo, n,
+      detail::get_library_data_t_from_type<T>(), lda, device_ws_size_tmp);
+  *device_ws_size = (int)device_ws_size_tmp;
+  return ret;
+}
+
+/// Computes all eigenvalues and, optionally, all eigenvectors of a real
+/// symmetric or Hermitian matrix using divide and conquer algorithm.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] jobz Must be job::novec or job::vec.
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] n The order of the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in, out] a The input matrix A. On exit, it is overwritten by
+/// eigenvectors.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] w_type The data type of the eigenvalues.
+/// \param [out] w The eigenvalues of the matrix A in ascending order.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The workspace size as a number of
+/// elements of type \tparam T.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+template <typename T, typename ValueT>
+inline int syheevd(sycl::queue &q, oneapi::mkl::job jobz,
+                   oneapi::mkl::uplo uplo, std::int64_t n, T *a,
+                   std::int64_t lda, ValueT *w, T *device_ws,
+                   int device_ws_size, int *info) {
+  return detail::lapack_shim<detail::syheevd_impl>(
+      q, detail::get_library_data_t_from_type<T>(), info, "syevd/heevd", q,
+      jobz, uplo, n, detail::get_library_data_t_from_type<T>(), a, lda, w,
+      device_ws, device_ws_size, info);
+}
+
+/// Computes the size of workspace memory of trtri function.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] diag Must be diag::nonunit or diag::unit.
+/// \param [in] n The order of the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [out] device_ws_size The device workspace in bytes.
+/// \param [out] host_ws_size The host workspace size in bytes. Currently the
+/// value is always zero.
+inline int trtri_scratchpad_size(sycl::queue &q, oneapi::mkl::uplo uplo,
+                                 oneapi::mkl::diag diag, std::int64_t n,
+                                 library_data_t a_type, std::int64_t lda,
+                                 std::size_t *device_ws_size,
+                                 std::size_t *host_ws_size = nullptr) {
+  if (host_ws_size)
+    *host_ws_size = 0;
+  std::size_t device_ws_size_tmp;
+  int ret = detail::lapack_shim<detail::trtri_scratchpad_size_impl>(
+      q, a_type, nullptr, "trtri_scratchpad_size", q, uplo, diag, n, a_type,
+      lda, device_ws_size_tmp);
+  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
+  return ret;
+}
+
+/// Computes the inverse of a triangular matrix.
+/// \return Returns 0 if no synchronous exception, otherwise returns 1.
+/// \param [in] q Device queue where computation will be performed. It must
+/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
+/// not defined).
+/// \param [in] uplo Must be uplo::upper or uplo::lower.
+/// \param [in] diag Must be diag::nonunit or diag::unit.
+/// \param [in] n The order of the matrix A.
+/// \param [in] a_type The data type of the matrix A.
+/// \param [in, out] a The input matrix A. On exit, it is overwritten by
+/// the inverse matrix of A.
+/// \param [in] lda The leading dimension of the matrix A.
+/// \param [in] device_ws The workspace.
+/// \param [in] device_ws_size The workspace size in bytes.
+/// \param [out] info If lapack synchronous exception is caught, the value
+/// returned from info() method of the exception is set to \p info.
+inline int trtri(sycl::queue &q, oneapi::mkl::uplo uplo, oneapi::mkl::diag diag,
+                 std::int64_t n, library_data_t a_type, void *a,
+                 std::int64_t lda, void *device_ws, std::size_t device_ws_size,
+                 int *info) {
+#ifdef DPCT_USM_LEVEL_NONE
+  throw std::runtime_error("this API is unsupported when USM level is none");
+#else
+  std::size_t device_ws_size_in_element_number =
+      detail::byte_to_element_number(device_ws_size, a_type);
+  return detail::lapack_shim<detail::trtri_impl>(
+      q, a_type, info, "trtri", q, uplo, diag, n, a_type, a, lda, device_ws,
+      device_ws_size_in_element_number, info);
+#endif
+}
+} // namespace lapack
+} // namespace dpct
+
+#endif // __DPCT_LAPACK_UTILS_HPP__
diff --git a/dpct/lib_common_utils.hpp b/dpct/lib_common_utils.hpp
new file mode 100644
index 0000000000000..62366da986cdc
--- /dev/null
+++ b/dpct/lib_common_utils.hpp
@@ -0,0 +1,174 @@
+//==---- lib_common_utils.hpp ---------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_LIB_COMMON_UTILS_HPP__
+#define __DPCT_LIB_COMMON_UTILS_HPP__
+
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include "memory.hpp"
+#include "util.hpp"
+
+namespace dpct {
+namespace detail {
+template <typename T> inline auto get_memory(const void *x) {
+  T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
+#ifdef DPCT_USM_LEVEL_NONE
+  return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
+#else
+  return new_x;
+#endif
+}
+
+template <typename T>
+inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q) {
+  using Ty = typename DataType<T>::T2;
+  Ty s_h;
+  if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
+    detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
+        .wait();
+  else
+    s_h = *reinterpret_cast<const Ty *>(s);
+  return s_h;
+}
+} // namespace detail
+
+enum class version_field : int { major, minor, update, patch };
+
+/// Returns the requested field of Intel(R) oneAPI Math Kernel Library version.
+/// \param field The version information field (major, minor, update or patch).
+/// \param result The result value.
+inline void mkl_get_version(version_field field, int *result) {
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                           "Project does not support this API.");
+#else
+  MKLVersion version;
+  mkl_get_version(&version);
+  if (version_field::major == field) {
+    *result = version.MajorVersion;
+  } else if (version_field::minor == field) {
+    *result = version.MinorVersion;
+  } else if (version_field::update == field) {
+    *result = version.UpdateVersion;
+  } else if (version_field::patch == field) {
+    *result = 0;
+  } else {
+    throw std::runtime_error("unknown field");
+  }
+#endif
+}
+
+enum class library_data_t : unsigned char {
+  real_float = 0,
+  complex_float,
+  real_double,
+  complex_double,
+  real_half,
+  complex_half,
+  real_bfloat16,
+  complex_bfloat16,
+  real_int4,
+  complex_int4,
+  real_uint4,
+  complex_uint4,
+  real_int8,
+  complex_int8,
+  real_uint8,
+  complex_uint8,
+  real_int16,
+  complex_int16,
+  real_uint16,
+  complex_uint16,
+  real_int32,
+  complex_int32,
+  real_uint32,
+  complex_uint32,
+  real_int64,
+  complex_int64,
+  real_uint64,
+  complex_uint64,
+  real_int8_4,
+  real_int8_32,
+  real_uint8_4,
+  library_data_t_size
+};
+
+namespace detail {
+template <typename ArgT>
+inline constexpr std::uint64_t get_type_combination_id(ArgT Val) {
+  static_assert((unsigned char)library_data_t::library_data_t_size <=
+                    std::numeric_limits<unsigned char>::max() &&
+                "library_data_t size exceeds limit.");
+  static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
+  return (std::uint64_t)Val;
+}
+
+template <typename FirstT, typename... RestT>
+inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
+                                                       RestT... RestVal) {
+  static_assert((std::uint8_t)library_data_t::library_data_t_size <=
+                    std::numeric_limits<unsigned char>::max() &&
+                "library_data_t size exceeds limit.");
+  static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
+  static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
+  return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
+}
+
+inline constexpr std::size_t library_data_size[] = {
+    8 * sizeof(float),                    // real_float
+    8 * sizeof(std::complex<float>),      // complex_float
+    8 * sizeof(double),                   // real_double
+    8 * sizeof(std::complex<double>),     // complex_double
+    8 * sizeof(sycl::half),               // real_half
+    8 * sizeof(std::complex<sycl::half>), // complex_half
+    16,                                   // real_bfloat16
+    16 * 2,                               // complex_bfloat16
+    4,                                    // real_int4
+    4 * 2,                                // complex_int4
+    4,                                    // real_uint4
+    4 * 2,                                // complex_uint4
+    8,                                    // real_int8
+    8 * 2,                                // complex_int8
+    8,                                    // real_uint8
+    8 * 2,                                // complex_uint8
+    16,                                   // real_int16
+    16 * 2,                               // complex_int16
+    16,                                   // real_uint16
+    16 * 2,                               // complex_uint16
+    32,                                   // real_int32
+    32 * 2,                               // complex_int32
+    32,                                   // real_uint32
+    32 * 2,                               // complex_uint32
+    64,                                   // real_int64
+    64 * 2,                               // complex_int64
+    64,                                   // real_uint64
+    64 * 2,                               // complex_uint64
+    8,                                    // real_int8_4
+    8,                                    // real_int8_32
+    8                                     // real_uint8_4
+};
+} // namespace detail
+
+#ifdef DPCT_USM_LEVEL_NONE
+/// Cast a "rvalue reference to a temporary object" to an "lvalue reference to
+/// that temporary object".
+/// CAUTION:
+/// The returned lvalue reference is available only before the last step in
+/// evaluating the full-expression that contains this function call.
+/// \param [in] temporary_object The rvalue reference to a temporary object.
+/// \returns The lvalue reference to that temporary object.
+template <typename T>
+inline typename std::enable_if_t<std::is_rvalue_reference_v<T &&>, T &>
+rvalue_ref_to_lvalue_ref(T &&temporary_object) {
+  return temporary_object;
+}
+#endif
+} // namespace dpct
+
+#endif // __DPCT_LIB_COMMON_UTILS_HPP__
diff --git a/dpct/math.hpp b/dpct/math.hpp
new file mode 100644
index 0000000000000..8d400fa682a00
--- /dev/null
+++ b/dpct/math.hpp
@@ -0,0 +1,1814 @@
+//==---- math.hpp ---------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_MATH_HPP__
+#define __DPCT_MATH_HPP__
+
+#include <limits>
+#include <climits>
+#include <sycl/sycl.hpp>
+#include <type_traits>
+
+namespace dpct {
+namespace detail {
+template <typename VecT, class BinaryOperation, class = void>
+class vectorized_binary {
+public:
+  inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) {
+    VecT v4;
+    for (size_t i = 0; i < v4.size(); ++i) {
+      v4[i] = binary_op(a[i], b[i]);
+    }
+    return v4;
+  }
+};
+template <typename VecT, class BinaryOperation>
+class vectorized_binary<
+    VecT, BinaryOperation,
+    std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>> {
+public:
+  inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) {
+    return binary_op(a, b).template as<VecT>();
+  }
+};
+
+template <typename T> inline bool isnan(const T a) { return sycl::isnan(a); }
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+inline bool isnan(const sycl::ext::oneapi::bfloat16 a) {
+  return sycl::ext::oneapi::experimental::isnan(a);
+}
+#endif
+} // namespace detail
+
+/// Compute fast_length for variable-length array
+/// \param [in] a The array
+/// \param [in] len Length of the array
+/// \returns The computed fast_length
+inline float fast_length(const float *a, int len) {
+  switch (len) {
+  case 1:
+    return a[0];
+  case 2:
+    return sycl::fast_length(sycl::float2(a[0], a[1]));
+  case 3:
+    return sycl::fast_length(sycl::float3(a[0], a[1], a[2]));
+  case 4:
+    return sycl::fast_length(sycl::float4(a[0], a[1], a[2], a[3]));
+  case 0:
+    return 0;
+  default:
+    float f = 0;
+    for (int i = 0; i < len; ++i)
+      f += a[i] * a[i];
+    return sycl::sqrt(f);
+  }
+}
+
+/// Calculate the square root of the input array.
+/// \param [in] a The array pointer
+/// \param [in] len Length of the array
+/// \returns The square root
+template <typename T> inline T length(const T *a, const int len) {
+  switch (len) {
+  case 1:
+    return a[0];
+  case 2:
+    return sycl::length(sycl::vec<T, 2>(a[0], a[1]));
+  case 3:
+    return sycl::length(sycl::vec<T, 3>(a[0], a[1], a[2]));
+  case 4:
+    return sycl::length(sycl::vec<T, 4>(a[0], a[1], a[2], a[3]));
+  default:
+    T ret = 0;
+    for (int i = 0; i < len; ++i)
+      ret += a[i] * a[i];
+    return sycl::sqrt(ret);
+  }
+}
+
+/// Returns min(max(val, min_val), max_val)
+/// \param [in] val The input value
+/// \param [in] min_val The minimum value
+/// \param [in] max_val The maximum value
+/// \returns the value between min_val and max_val
+template <typename T> inline T clamp(T val, T min_val, T max_val) {
+  return sycl::clamp(val, min_val, max_val);
+}
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+template <>
+inline sycl::ext::oneapi::bfloat16 clamp(sycl::ext::oneapi::bfloat16 val,
+                                         sycl::ext::oneapi::bfloat16 min_val,
+                                         sycl::ext::oneapi::bfloat16 max_val) {
+  if (val < min_val)
+    return min_val;
+  if (val > max_val)
+    return max_val;
+  return val;
+}
+#endif
+template <typename T>
+inline sycl::marray<T, 2> clamp(sycl::marray<T, 2> val,
+                                sycl::marray<T, 2> min_val,
+                                sycl::marray<T, 2> max_val) {
+  return {clamp(val[0], min_val[0], max_val[0]),
+          clamp(val[1], min_val[1], max_val[1])};
+}
+
+/// Performs comparison.
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] binary_op functor that implements the binary operation
+/// \returns the comparison result
+template <typename T, class BinaryOperation>
+inline std::enable_if_t<
+    std::is_same_v<std::invoke_result_t<BinaryOperation, T, T>, bool>, bool>
+compare(const T a, const T b, const BinaryOperation binary_op) {
+  return binary_op(a, b);
+}
+template <typename T>
+inline std::enable_if_t<
+    std::is_same_v<std::invoke_result_t<std::not_equal_to<>, T, T>, bool>, bool>
+compare(const T a, const T b, const std::not_equal_to<> binary_op) {
+  return !detail::isnan(a) && !detail::isnan(b) && binary_op(a, b);
+}
+
+/// Performs unordered comparison.
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] binary_op functor that implements the binary operation
+/// \returns the comparison result
+template <typename T, class BinaryOperation>
+inline std::enable_if_t<
+    std::is_same_v<std::invoke_result_t<BinaryOperation, T, T>, bool>, bool>
+unordered_compare(const T a, const T b, const BinaryOperation binary_op) {
+  return detail::isnan(a) || detail::isnan(b) || binary_op(a, b);
+}
+
+/// Performs 2 element comparison and return true if both results are true.
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] binary_op functor that implements the binary operation
+/// \returns the comparison result
+template <typename T, class BinaryOperation>
+inline std::enable_if_t<T::size() == 2, bool>
+compare_both(const T a, const T b, const BinaryOperation binary_op) {
+  return compare(a[0], b[0], binary_op) && compare(a[1], b[1], binary_op);
+}
+
+/// Performs 2 element unordered comparison and return true if both results are
+/// true.
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] binary_op functor that implements the binary operation
+/// \returns the comparison result
+template <typename T, class BinaryOperation>
+inline std::enable_if_t<T::size() == 2, bool>
+unordered_compare_both(const T a, const T b, const BinaryOperation binary_op) {
+  return unordered_compare(a[0], b[0], binary_op) &&
+         unordered_compare(a[1], b[1], binary_op);
+}
+
+/// Performs 2 element comparison.
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] binary_op functor that implements the binary operation
+/// \returns the comparison result
+template <typename T, class BinaryOperation>
+inline std::enable_if_t<T::size() == 2, T>
+compare(const T a, const T b, const BinaryOperation binary_op) {
+  return {compare(a[0], b[0], binary_op), compare(a[1], b[1], binary_op)};
+}
+
+/// Performs 2 elements comparison, compare result of each element is 0 (false)
+/// or 0xffff (true), returns an unsigned int by composing compare result of two
+/// elements.
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] binary_op functor that implements the binary operation
+/// \returns the comparison result
+template <typename T, class BinaryOperation>
+inline unsigned compare_mask(const sycl::vec<T, 2> a, const sycl::vec<T, 2> b,
+                             const BinaryOperation binary_op) {
+  return sycl::vec<short, 2>(-compare(a[0], b[0], binary_op),
+                             -compare(a[1], b[1], binary_op))
+      .as<sycl::vec<unsigned, 1>>();
+}
+template <typename T, class BinaryOperation>
+inline unsigned compare_mask(const sycl::marray<T, 2> a,
+                             const sycl::marray<T, 2> b,
+                             const BinaryOperation binary_op) {
+  return sycl::vec<short, 2>(-compare(a[0], b[0], binary_op),
+                             -compare(a[1], b[1], binary_op))
+      .as<sycl::vec<unsigned, 1>>();
+}
+
+/// Performs 2 element unordered comparison.
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] binary_op functor that implements the binary operation
+/// \returns the comparison result
+template <typename T, class BinaryOperation>
+inline std::enable_if_t<T::size() == 2, T>
+unordered_compare(const T a, const T b, const BinaryOperation binary_op) {
+  return {unordered_compare(a[0], b[0], binary_op),
+          unordered_compare(a[1], b[1], binary_op)};
+}
+
+/// Performs 2 elements unordered comparison, compare result of each element is
+/// 0 (false) or 0xffff (true), returns an unsigned int by composing compare
+/// result of two elements.
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] binary_op functor that implements the binary operation
+/// \returns the comparison result
+template <typename T, class BinaryOperation>
+inline unsigned unordered_compare_mask(const sycl::vec<T, 2> a,
+                                       const sycl::vec<T, 2> b,
+                                       const BinaryOperation binary_op) {
+  return sycl::vec<short, 2>(-unordered_compare(a[0], b[0], binary_op),
+                             -unordered_compare(a[1], b[1], binary_op))
+      .as<sycl::vec<unsigned, 1>>();
+}
+template <typename T, class BinaryOperation>
+inline unsigned unordered_compare_mask(const sycl::marray<T, 2> a,
+                                       const sycl::marray<T, 2> b,
+                                       const BinaryOperation binary_op) {
+  return sycl::vec<short, 2>(-unordered_compare(a[0], b[0], binary_op),
+                             -unordered_compare(a[1], b[1], binary_op))
+      .as<sycl::vec<unsigned, 1>>();
+}
+
+/// Bitfield-extract.
+///
+/// \tparam T The type of \param source value, must be an integer.
+/// \param source The source value to extracting.
+/// \param bit_start The position to start extracting.
+/// \param num_bits The number of bits to extracting.
+template <typename T>
+inline std::enable_if_t<std::is_unsigned_v<T>, T>
+bfe(const T source, const uint32_t bit_start, const uint32_t num_bits) {
+  const T mask = (T{1} << num_bits) - 1;
+  return (source >> bit_start) & mask;
+}
+
+/// Bitfield-extract with boundary checking.
+///
+/// Extract bit field from \param source and return the zero or sign-extended
+/// result. Source \param bit_start gives the bit field starting bit position,
+/// and source \param num_bits gives the bit field length in bits.
+///
+/// The result is padded with the sign bit of the extracted field. If the start
+/// position is beyond the msb of the input, the result was filled with the
+/// replicated sign bit of the extracted field.
+///
+/// \tparam T The type of \param source value, must be an integer.
+/// \param source The source value to extracting.
+/// \param bit_start The position to start extracting.
+/// \param num_bits The number of bits to extracting.
+template <typename T>
+inline std::enable_if_t<std::is_integral_v<T>, T>
+bfe_safe(const T source, const uint32_t bit_start, const uint32_t num_bits) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  if constexpr (std::is_same_v<T, int8_t> || std::is_same_v<T, int16_t> ||
+                std::is_same_v<T, int32_t>) {
+    int32_t res{};
+    asm volatile("bfe.s32 %0, %1, %2, %3;"
+                 : "=r"(res)
+                 : "r"((int32_t)source), "r"(bit_start), "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, uint8_t> ||
+                       std::is_same_v<T, uint16_t> ||
+                       std::is_same_v<T, uint32_t>) {
+    uint32_t res{};
+    asm volatile("bfe.u32 %0, %1, %2, %3;"
+                 : "=r"(res)
+                 : "r"((uint32_t)source), "r"(bit_start), "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, int64_t>) {
+    T res{};
+    asm volatile("bfe.s64 %0, %1, %2, %3;"
+                 : "=l"(res)
+                 : "l"(source), "r"(bit_start), "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, uint64_t>) {
+    T res{};
+    asm volatile("bfe.u64 %0, %1, %2, %3;"
+                 : "=l"(res)
+                 : "l"(source), "r"(bit_start), "r"(num_bits));
+    return res;
+  }
+#endif
+  const uint32_t bit_width = CHAR_BIT * sizeof(T);
+  const uint32_t pos = std::min(bit_start, bit_width);
+  const uint32_t len = std::min(pos + num_bits, bit_width) - pos;
+  if constexpr (std::is_signed_v<T>) {
+    const T mask = (T{1} << len) - 1;
+
+    // Find the sign-bit, the result is padded with the sign bit of the
+    // extracted field.
+    //
+    // sign_bit = len == 0 ? 0 : source[min(pos + len - 1, bit_width - 1)]
+    const uint32_t sign_bit_pos = std::min(pos + len - 1, bit_width - 1);
+    const T sign_bit = len != 0 && ((source >> sign_bit_pos) & 1);
+    const T sign_bit_padding = (-sign_bit & ~mask);
+    return ((source >> pos) & mask) | sign_bit_padding;
+  } else {
+    return dpct::bfe(source, pos, len);
+  }
+}
+
+/// Bitfield-insert.
+///
+/// \tparam T The type of \param x and \param y , must be an unsigned integer.
+/// \param x The source of the bitfield.
+/// \param y The source where bitfield is inserted.
+/// \param bit_start The position to start insertion.
+/// \param num_bits The number of bits to insertion.
+template <typename T>
+inline std::enable_if_t<std::is_unsigned_v<T>, T>
+bfi(const T x, const T y, const uint32_t bit_start, const uint32_t num_bits) {
+  constexpr unsigned bit_width = CHAR_BIT * sizeof(T);
+
+  // if bit_start > bit_width || len == 0, should return y.
+  const uint32_t ignore_bfi = bit_start > bit_width || num_bits == 0;
+  T extract_bitfield_mask = (~(T{0}) >> (bit_width - num_bits)) << bit_start;
+  T clean_bitfield_mask = ~extract_bitfield_mask;
+  return (y & (-ignore_bfi | clean_bitfield_mask)) |
+         (~-ignore_bfi & ((x << bit_start) & extract_bitfield_mask));
+}
+
+/// Bitfield-insert with boundary checking.
+///
+/// Align and insert a bit field from \param x into \param y . Source \param
+/// bit_start gives the starting bit position for the insertion, and source
+/// \param num_bits gives the bit field length in bits.
+///
+/// \tparam T The type of \param x and \param y , must be an unsigned integer.
+/// \param x The source of the bitfield.
+/// \param y The source where bitfield is inserted.
+/// \param bit_start The position to start insertion.
+/// \param num_bits The number of bits to insertion.
+template <typename T>
+inline std::enable_if_t<std::is_unsigned_v<T>, T>
+bfi_safe(const T x, const T y, const uint32_t bit_start,
+         const uint32_t num_bits) {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  if constexpr (std::is_same_v<T, uint8_t> || std::is_same_v<T, uint16_t> ||
+                std::is_same_v<T, uint32_t>) {
+    uint32_t res{};
+    asm volatile("bfi.b32 %0, %1, %2, %3, %4;"
+                 : "=r"(res)
+                 : "r"((uint32_t)x), "r"((uint32_t)y), "r"(bit_start),
+                   "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, uint64_t>) {
+    uint64_t res{};
+    asm volatile("bfi.b64 %0, %1, %2, %3, %4;"
+                 : "=l"(res)
+                 : "l"(x), "l"(y), "r"(bit_start), "r"(num_bits));
+    return res;
+  }
+#endif
+  constexpr unsigned bit_width = CHAR_BIT * sizeof(T);
+  const uint32_t pos = std::min(bit_start, bit_width);
+  const uint32_t len = std::min(pos + num_bits, bit_width) - pos;
+  return dpct::bfi(x, y, pos, len);
+}
+
+/// Determine whether 2 element value is NaN.
+/// \param [in] a The input value
+/// \returns the comparison result
+template <typename T>
+inline std::enable_if_t<T::size() == 2, T> isnan(const T a) {
+  return {detail::isnan(a[0]), detail::isnan(a[1])};
+}
+
+/// Emulated function for __funnelshift_l
+inline unsigned int funnelshift_l(unsigned int low, unsigned int high,
+                                  unsigned int shift) {
+  return (sycl::upsample(high, low) << (shift & 31U)) >> 32;
+}
+
+/// Emulated function for __funnelshift_lc
+inline unsigned int funnelshift_lc(unsigned int low, unsigned int high,
+                                   unsigned int shift) {
+  return (sycl::upsample(high, low) << sycl::min(shift, 32U)) >> 32;
+}
+
+/// Emulated function for __funnelshift_r
+inline unsigned int funnelshift_r(unsigned int low, unsigned int high,
+                                  unsigned int shift) {
+  return (sycl::upsample(high, low) >> (shift & 31U)) & 0xFFFFFFFF;
+}
+
+/// Emulated function for __funnelshift_rc
+inline unsigned int funnelshift_rc(unsigned int low, unsigned int high,
+                                   unsigned int shift) {
+  return (sycl::upsample(high, low) >> sycl::min(shift, 32U)) & 0xFFFFFFFF;
+}
+
+/// cbrt function wrapper.
+template <typename T> inline T cbrt(T val) { return sycl::cbrt((T)val); }
+
+// min function overloads.
+// For floating-point types, `float` or `double` arguments are acceptable.
+// For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
+// `std::int64_t` type arguments are acceptable.
+inline double min(const double a, const float b) {
+  return sycl::fmin(a, static_cast<double>(b));
+}
+inline double min(const float a, const double b) {
+  return sycl::fmin(static_cast<double>(a), b);
+}
+inline float min(const float a, const float b) { return sycl::fmin(a, b); }
+inline double min(const double a, const double b) { return sycl::fmin(a, b); }
+inline std::uint32_t min(const std::uint32_t a, const std::int32_t b) {
+  return sycl::min(a, static_cast<std::uint32_t>(b));
+}
+inline std::uint32_t min(const std::int32_t a, const std::uint32_t b) {
+  return sycl::min(static_cast<std::uint32_t>(a), b);
+}
+inline std::int32_t min(const std::int32_t a, const std::int32_t b) {
+  return sycl::min(a, b);
+}
+inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b) {
+  return sycl::min(a, b);
+}
+inline std::uint64_t min(const std::uint64_t a, const std::int64_t b) {
+  return sycl::min(a, static_cast<std::uint64_t>(b));
+}
+inline std::uint64_t min(const std::int64_t a, const std::uint64_t b) {
+  return sycl::min(static_cast<std::uint64_t>(a), b);
+}
+inline std::int64_t min(const std::int64_t a, const std::int64_t b) {
+  return sycl::min(a, b);
+}
+inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b) {
+  return sycl::min(a, b);
+}
+inline std::uint64_t min(const std::uint64_t a, const std::int32_t b) {
+  return sycl::min(a, static_cast<std::uint64_t>(b));
+}
+inline std::uint64_t min(const std::int32_t a, const std::uint64_t b) {
+  return sycl::min(static_cast<std::uint64_t>(a), b);
+}
+inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b) {
+  return sycl::min(a, static_cast<std::uint64_t>(b));
+}
+inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b) {
+  return sycl::min(static_cast<std::uint64_t>(a), b);
+}
+// max function overloads.
+// For floating-point types, `float` or `double` arguments are acceptable.
+// For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
+// `std::int64_t` type arguments are acceptable.
+inline double max(const double a, const float b) {
+  return sycl::fmax(a, static_cast<double>(b));
+}
+inline double max(const float a, const double b) {
+  return sycl::fmax(static_cast<double>(a), b);
+}
+inline float max(const float a, const float b) { return sycl::fmax(a, b); }
+inline double max(const double a, const double b) { return sycl::fmax(a, b); }
+inline std::uint32_t max(const std::uint32_t a, const std::int32_t b) {
+  return sycl::max(a, static_cast<std::uint32_t>(b));
+}
+inline std::uint32_t max(const std::int32_t a, const std::uint32_t b) {
+  return sycl::max(static_cast<std::uint32_t>(a), b);
+}
+inline std::int32_t max(const std::int32_t a, const std::int32_t b) {
+  return sycl::max(a, b);
+}
+inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b) {
+  return sycl::max(a, b);
+}
+inline std::uint64_t max(const std::uint64_t a, const std::int64_t b) {
+  return sycl::max(a, static_cast<std::uint64_t>(b));
+}
+inline std::uint64_t max(const std::int64_t a, const std::uint64_t b) {
+  return sycl::max(static_cast<std::uint64_t>(a), b);
+}
+inline std::int64_t max(const std::int64_t a, const std::int64_t b) {
+  return sycl::max(a, b);
+}
+inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b) {
+  return sycl::max(a, b);
+}
+inline std::uint64_t max(const std::uint64_t a, const std::int32_t b) {
+  return sycl::max(a, static_cast<std::uint64_t>(b));
+}
+inline std::uint64_t max(const std::int32_t a, const std::uint64_t b) {
+  return sycl::max(static_cast<std::uint64_t>(a), b);
+}
+inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b) {
+  return sycl::max(a, static_cast<std::uint64_t>(b));
+}
+inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b) {
+  return sycl::max(static_cast<std::uint64_t>(a), b);
+}
+
+// pow functions overload.
+inline float pow(const float a, const int b) { return sycl::pown(a, b); }
+inline double pow(const double a, const int b) { return sycl::pown(a, b); }
+inline float pow(const float a, const float b) { return sycl::pow(a, b); }
+inline double pow(const double a, const double b) { return sycl::pow(a, b); }
+template <typename T, typename U>
+inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
+pow(const T a, const U b) {
+  return sycl::pow(a, static_cast<T>(b));
+}
+template <typename T, typename U>
+inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
+pow(const T a, const U b) {
+  return sycl::pow(static_cast<double>(a), static_cast<double>(b));
+}
+
+namespace detail {
+template <typename T>
+constexpr bool is_floating_point =
+    std::disjunction_v<std::is_floating_point<T>, std::is_same<T, sycl::half>
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+                       ,
+                       std::is_same<T, sycl::ext::oneapi::bfloat16>
+#endif
+                       >;
+} // namespace detail
+
+/// Performs relu saturation.
+/// \param [in] a The input value
+/// \returns the relu saturation result
+template <typename T> inline T relu(T a) {
+  T zero{};
+  if constexpr (detail::is_floating_point<T>)
+    return !detail::isnan(a) && a < zero ? zero : a;
+  else
+    return a < zero ? zero : a;
+}
+template <class T> inline sycl::vec<T, 2> relu(const sycl::vec<T, 2> a) {
+  return {relu(a[0]), relu(a[1])};
+}
+template <class T> inline sycl::marray<T, 2> relu(const sycl::marray<T, 2> a) {
+  return {relu(a[0]), relu(a[1])};
+}
+
+/// Performs complex number multiply addition.
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns the operation result
+template <typename T>
+inline sycl::vec<T, 2> complex_mul_add(const sycl::vec<T, 2> a,
+                                       const sycl::vec<T, 2> b,
+                                       const sycl::vec<T, 2> c) {
+  return sycl::vec<T, 2>{a[0] * b[0] - a[1] * b[1] + c[0],
+                         a[0] * b[1] + a[1] * b[0] + c[1]};
+}
+template <typename T>
+inline sycl::marray<T, 2> complex_mul_add(const sycl::marray<T, 2> a,
+                                          const sycl::marray<T, 2> b,
+                                          const sycl::marray<T, 2> c) {
+  return sycl::marray<T, 2>{a[0] * b[0] - a[1] * b[1] + c[0],
+                            a[0] * b[1] + a[1] * b[0] + c[1]};
+}
+
+/// Performs 2 elements comparison and returns the bigger one. If either of
+/// inputs is NaN, then return NaN.
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns the bigger value
+template <typename T> inline T fmax_nan(const T a, const T b) {
+  if (detail::isnan(a) || detail::isnan(b))
+    return NAN;
+  return sycl::fmax(a, b);
+}
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+template <>
+inline sycl::ext::oneapi::bfloat16
+fmax_nan(const sycl::ext::oneapi::bfloat16 a,
+         const sycl::ext::oneapi::bfloat16 b) {
+  if (detail::isnan(a) || detail::isnan(b))
+    return NAN;
+  return sycl::fmax(float(a), float(b));
+}
+#endif
+template <typename T>
+inline sycl::vec<T, 2> fmax_nan(const sycl::vec<T, 2> a,
+                                const sycl::vec<T, 2> b) {
+  return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])};
+}
+template <typename T>
+inline sycl::marray<T, 2> fmax_nan(const sycl::marray<T, 2> a,
+                                   const sycl::marray<T, 2> b) {
+  return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])};
+}
+
+/// Performs 2 elements comparison and returns the smaller one. If either of
+/// inputs is NaN, then return NaN.
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns the smaller value
+template <typename T> inline T fmin_nan(const T a, const T b) {
+  if (detail::isnan(a) || detail::isnan(b))
+    return NAN;
+  return sycl::fmin(a, b);
+}
+#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
+template <>
+inline sycl::ext::oneapi::bfloat16
+fmin_nan(const sycl::ext::oneapi::bfloat16 a,
+         const sycl::ext::oneapi::bfloat16 b) {
+  if (detail::isnan(a) || detail::isnan(b))
+    return NAN;
+  return sycl::fmin(float(a), float(b));
+}
+#endif
+template <typename T>
+inline sycl::vec<T, 2> fmin_nan(const sycl::vec<T, 2> a,
+                                const sycl::vec<T, 2> b) {
+  return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])};
+}
+template <typename T>
+inline sycl::marray<T, 2> fmin_nan(const sycl::marray<T, 2> a,
+                                   const sycl::marray<T, 2> b) {
+  return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])};
+}
+
+/// A sycl::abs wrapper functors.
+struct abs {
+  template <typename T> auto operator()(const T x) const {
+    return sycl::abs(x);
+  }
+};
+
+/// A sycl::abs_diff wrapper functors.
+struct abs_diff {
+  template <typename T> auto operator()(const T x, const T y) const {
+    return sycl::abs_diff(x, y);
+  }
+};
+
+/// A sycl::add_sat wrapper functors.
+struct add_sat {
+  template <typename T> auto operator()(const T x, const T y) const {
+    return sycl::add_sat(x, y);
+  }
+};
+
+/// A sycl::rhadd wrapper functors.
+struct rhadd {
+  template <typename T> auto operator()(const T x, const T y) const {
+    return sycl::rhadd(x, y);
+  }
+};
+
+/// A sycl::hadd wrapper functors.
+struct hadd {
+  template <typename T> auto operator()(const T x, const T y) const {
+    return sycl::hadd(x, y);
+  }
+};
+
+/// A sycl::max wrapper functors.
+struct maximum {
+  template <typename T> auto operator()(const T x, const T y) const {
+    return sycl::max(x, y);
+  }
+};
+
+/// A sycl::min wrapper functors.
+struct minimum {
+  template <typename T> auto operator()(const T x, const T y) const {
+    return sycl::min(x, y);
+  }
+};
+
+/// A sycl::sub_sat wrapper functors.
+struct sub_sat {
+  template <typename T> auto operator()(const T x, const T y) const {
+    return sycl::sub_sat(x, y);
+  }
+};
+
+/// Compute vectorized binary operation value for two values, with each value
+/// treated as a vector type \p VecT.
+/// \tparam [in] VecT The type of the vector
+/// \tparam [in] BinaryOperation The binary operation class
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The vectorized binary operation value of the two values
+template <typename VecT, class BinaryOperation>
+inline unsigned vectorized_binary(unsigned a, unsigned b,
+                                  const BinaryOperation binary_op) {
+  sycl::vec<unsigned, 1> v0{a}, v1{b};
+  auto v2 = v0.as<VecT>();
+  auto v3 = v1.as<VecT>();
+  auto v4 =
+      detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op);
+  v0 = v4.template as<sycl::vec<unsigned, 1>>();
+  return v0;
+}
+
+/// Compute vectorized isgreater for two values, with each value treated as a
+/// vector type \p S.
+/// \tparam [in] S The type of the vector
+/// \tparam [in] T The type of the original values
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The vectorized greater than of the two values
+template <typename S, typename T> inline T vectorized_isgreater(T a, T b) {
+  sycl::vec<T, 1> v0{a}, v1{b};
+  auto v2 = v0.template as<S>();
+  auto v3 = v1.template as<S>();
+  auto v4 = v2 > v3;
+  v0 = v4.template as<sycl::vec<T, 1>>();
+  return v0;
+}
+
+/// Compute vectorized max for two values, with each value treated as a vector
+/// type \p S.
+/// \tparam [in] S The type of the vector
+/// \tparam [in] T The type of the original values
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The vectorized max of the two values
+template <typename S, typename T> inline T vectorized_max(T a, T b) {
+  sycl::vec<T, 1> v0{a}, v1{b};
+  auto v2 = v0.template as<S>();
+  auto v3 = v1.template as<S>();
+  auto v4 = sycl::max(v2, v3);
+  v0 = v4.template as<sycl::vec<T, 1>>();
+  return v0;
+}
+
+/// Compute vectorized min for two values, with each value treated as a vector
+/// type \p S.
+/// \tparam [in] S The type of the vector
+/// \tparam [in] T The type of the original values
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The vectorized min of the two values
+template <typename S, typename T> inline T vectorized_min(T a, T b) {
+  sycl::vec<T, 1> v0{a}, v1{b};
+  auto v2 = v0.template as<S>();
+  auto v3 = v1.template as<S>();
+  auto v4 = sycl::min(v2, v3);
+  v0 = v4.template as<sycl::vec<T, 1>>();
+  return v0;
+}
+
+/// Compute vectorized unary operation for a value, with the value treated as a
+/// vector type \p VecT.
+/// \tparam [in] VecT The type of the vector
+/// \tparam [in] UnaryOperation The unary operation class
+/// \param [in] a The input value
+/// \returns The vectorized unary operation value of the input value
+template <typename VecT, class UnaryOperation>
+inline unsigned vectorized_unary(unsigned a, const UnaryOperation unary_op) {
+  sycl::vec<unsigned, 1> v0{a};
+  auto v1 = v0.as<VecT>();
+  auto v2 = unary_op(v1);
+  v0 = v2.template as<sycl::vec<unsigned, 1>>();
+  return v0;
+}
+
+/// Compute vectorized absolute difference for two values without modulo
+/// overflow, with each value treated as a vector type \p VecT.
+/// \tparam [in] VecT The type of the vector
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The vectorized absolute difference of the two values
+template <typename VecT>
+inline unsigned vectorized_sum_abs_diff(unsigned a, unsigned b) {
+  sycl::vec<unsigned, 1> v0{a}, v1{b};
+  auto v2 = v0.as<VecT>();
+  auto v3 = v1.as<VecT>();
+  auto v4 = sycl::abs_diff(v2, v3);
+  unsigned sum = 0;
+  for (size_t i = 0; i < v4.size(); ++i) {
+    sum += v4[i];
+  }
+  return sum;
+}
+
+namespace detail {
+/// Extend the 'val' to 'bit' size, zero extend for unsigned int and signed
+/// extend for signed int.
+template <typename T> inline auto zero_or_signed_extent(T val, unsigned bit) {
+  if constexpr (std::is_signed_v<T>) {
+    if constexpr (std::is_same_v<T, int32_t>) {
+      assert(bit < 64 &&
+             "When extend int32 value, bit must be smaller than 64.");
+      return int64_t(val) << (64 - bit) >> (64 - bit);
+    } else if constexpr (std::is_same_v<T, int16_t>) {
+      assert(bit < 32 &&
+             "When extend int16 value, bit must be smaller than 32.");
+      return int32_t(val) << (32 - bit) >> (32 - bit);
+    } else if constexpr (std::is_same_v<T, int8_t>) {
+      assert(bit < 16 &&
+             "When extend int8 value, bit must be smaller than 16.");
+      return int16_t(val) << (16 - bit) >> (16 - bit);
+    } else {
+      assert(bit < 64 && "Cannot extend int64 value.");
+      return val;
+    }
+  } else
+    return val;
+}
+
+template <typename RetT, bool NeedSat, typename AT, typename BT,
+          typename BinaryOperation>
+inline constexpr std::enable_if_t<
+    std::is_integral_v<AT> && std::is_integral_v<BT> &&
+        std::is_integral_v<RetT> && sizeof(AT) == 4 && sizeof(BT) == 4 &&
+        sizeof(RetT) == 4,
+    RetT>
+extend_binary(AT a, BT b, BinaryOperation binary_op) {
+  int64_t extend_a = zero_or_signed_extent(a, 33);
+  int64_t extend_b = zero_or_signed_extent(b, 33);
+  int64_t ret = binary_op(extend_a, extend_b);
+  if constexpr (NeedSat)
+    return dpct::clamp<int64_t>(ret, std::numeric_limits<RetT>::min(),
+                                std::numeric_limits<RetT>::max());
+  return ret;
+}
+
+template <typename RetT, bool NeedSat, typename AT, typename BT, typename CT,
+          typename BinaryOperation1, typename BinaryOperation2>
+inline constexpr std::enable_if_t<
+    std::is_integral_v<AT> && std::is_integral_v<BT> &&
+        std::is_integral_v<CT> && std::is_integral_v<RetT> && sizeof(AT) == 4 &&
+        sizeof(BT) == 4 && sizeof(CT) == 4 && sizeof(RetT) == 4,
+    RetT>
+extend_binary(AT a, BT b, CT c, BinaryOperation1 binary_op,
+              BinaryOperation2 second_op) {
+  int64_t extend_a = zero_or_signed_extent(a, 33);
+  int64_t extend_b = zero_or_signed_extent(b, 33);
+  int64_t extend_temp =
+      zero_or_signed_extent(binary_op(extend_a, extend_b), 34);
+  if constexpr (NeedSat)
+    extend_temp =
+        dpct::clamp<int64_t>(extend_temp, std::numeric_limits<RetT>::min(),
+                             std::numeric_limits<RetT>::max());
+  int64_t extend_c = zero_or_signed_extent(c, 33);
+  return second_op(extend_temp, extend_c);
+}
+
+template <typename T> sycl::vec<int32_t, 2> extractAndExtend2(T a) {
+  sycl::vec<int32_t, 2> ret;
+  sycl::vec<T, 1> va{a};
+  if constexpr (std::is_signed_v<T>) {
+    auto v = va.template as<sycl::vec<int16_t, 2>>();
+    ret[0] = zero_or_signed_extent(v[0], 17);
+    ret[1] = zero_or_signed_extent(v[1], 17);
+  } else {
+    auto v = va.template as<sycl::vec<uint16_t, 2>>();
+    ret[0] = zero_or_signed_extent(v[0], 17);
+    ret[1] = zero_or_signed_extent(v[1], 17);
+  }
+  return ret;
+}
+
+template <typename T> sycl::vec<int16_t, 4> extractAndExtend4(T a) {
+  sycl::vec<int16_t, 4> ret;
+  sycl::vec<T, 1> va{a};
+  if constexpr (std::is_signed_v<T>) {
+    auto v = va.template as<sycl::vec<int8_t, 4>>();
+    ret[0] = zero_or_signed_extent(v[0], 9);
+    ret[1] = zero_or_signed_extent(v[1], 9);
+    ret[2] = zero_or_signed_extent(v[2], 9);
+    ret[3] = zero_or_signed_extent(v[3], 9);
+  } else {
+    auto v = va.template as<sycl::vec<uint8_t, 4>>();
+    ret[0] = zero_or_signed_extent(v[0], 9);
+    ret[1] = zero_or_signed_extent(v[1], 9);
+    ret[2] = zero_or_signed_extent(v[2], 9);
+    ret[3] = zero_or_signed_extent(v[3], 9);
+  }
+  return ret;
+}
+
+template <typename RetT, bool NeedSat, bool NeedAdd, typename AT, typename BT,
+          typename BinaryOperation>
+inline constexpr std::enable_if_t<
+    std::is_integral_v<AT> && std::is_integral_v<BT> &&
+        std::is_integral_v<RetT> && sizeof(AT) == 4 && sizeof(BT) == 4 &&
+        sizeof(RetT) == 4,
+    RetT>
+extend_vbinary2(AT a, BT b, RetT c, BinaryOperation binary_op) {
+  sycl::vec<int32_t, 2> extend_a = extractAndExtend2(a);
+  sycl::vec<int32_t, 2> extend_b = extractAndExtend2(b);
+  sycl::vec<int32_t, 2> temp{binary_op(extend_a[0], extend_b[0]),
+                             binary_op(extend_a[1], extend_b[1])};
+  if constexpr (NeedSat) {
+    int32_t min_val = 0, max_val = 0;
+    if constexpr (std::is_signed_v<RetT>) {
+      min_val = std::numeric_limits<int16_t>::min();
+      max_val = std::numeric_limits<int16_t>::max();
+    } else {
+      min_val = std::numeric_limits<uint16_t>::min();
+      max_val = std::numeric_limits<uint16_t>::max();
+    }
+    temp = dpct::clamp(temp, {min_val, min_val}, {max_val, max_val});
+  }
+  if constexpr (NeedAdd) {
+    return temp[0] + temp[1] + c;
+  }
+  if constexpr (std::is_signed_v<RetT>) {
+    return sycl::vec<int16_t, 2>{temp[0], temp[1]}.as<sycl::vec<RetT, 1>>();
+  } else {
+    return sycl::vec<uint16_t, 2>{temp[0], temp[1]}.as<sycl::vec<RetT, 1>>();
+  }
+}
+
+template <typename RetT, bool NeedSat, bool NeedAdd, typename AT, typename BT,
+          typename BinaryOperation>
+inline constexpr std::enable_if_t<
+    std::is_integral_v<AT> && std::is_integral_v<BT> &&
+        std::is_integral_v<RetT> && sizeof(AT) == 4 && sizeof(BT) == 4 &&
+        sizeof(RetT) == 4,
+    RetT>
+extend_vbinary4(AT a, BT b, RetT c, BinaryOperation binary_op) {
+  sycl::vec<int16_t, 4> extend_a = extractAndExtend4(a);
+  sycl::vec<int16_t, 4> extend_b = extractAndExtend4(b);
+  sycl::vec<int16_t, 4> temp{
+      binary_op(extend_a[0], extend_b[0]), binary_op(extend_a[1], extend_b[1]),
+      binary_op(extend_a[2], extend_b[2]), binary_op(extend_a[3], extend_b[3])};
+  if constexpr (NeedSat) {
+    int16_t min_val = 0, max_val = 0;
+    if constexpr (std::is_signed_v<RetT>) {
+      min_val = std::numeric_limits<int8_t>::min();
+      max_val = std::numeric_limits<int8_t>::max();
+    } else {
+      min_val = std::numeric_limits<uint8_t>::min();
+      max_val = std::numeric_limits<uint8_t>::max();
+    }
+    temp = dpct::clamp(temp, {min_val, min_val, min_val, min_val},
+                       {max_val, max_val, max_val, max_val});
+  }
+  if constexpr (NeedAdd) {
+    return temp[0] + temp[1] + temp[2] + temp[3] + c;
+  }
+  if constexpr (std::is_signed_v<RetT>) {
+    return sycl::vec<int8_t, 4>{temp[0], temp[1], temp[2], temp[3]}
+        .as<sycl::vec<RetT, 1>>();
+  } else {
+    return sycl::vec<uint8_t, 4>{temp[0], temp[1], temp[2], temp[3]}
+        .as<sycl::vec<RetT, 1>>();
+  }
+}
+
+template <typename T1, typename T2>
+using dot_product_acc_t =
+    std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
+                       uint32_t, int32_t>;
+
+template <typename T> sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val) {
+  return sycl::vec<T, 1>(val)
+      .template as<sycl::vec<
+          std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
+      .template convert<T>();
+}
+
+template <typename T> sycl::vec<T, 2> extract_and_sign_or_zero_extend2(T val) {
+  return sycl::vec<T, 1>(val)
+      .template as<sycl::vec<
+          std::conditional_t<std::is_signed_v<T>, int16_t, uint16_t>, 2>>()
+      .template convert<T>();
+}
+} // namespace detail
+
+/// Two-way dot product-accumulate. Calculate and return interger_vector2(
+/// \param a) dot product interger_vector2(low16_bit( \param b))  + \param c
+///
+/// \tparam [in] T1 The type of first value.
+/// \tparam [in] T2 The type of second value.
+/// \param [in] a The first value.
+/// \param [in] b The second value.
+/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
+/// uint32_t else has type int32_t.
+/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
+/// result.
+template <typename T1, typename T2, typename T3>
+inline auto dp2a_lo(T1 a, T2 b, T3 c) {
+  detail::dot_product_acc_t<T1, T2> res = c;
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
+    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
+  res = __dp2a_lo(a, b, c);
+#else
+  auto va = ::dpct::detail::extract_and_sign_or_zero_extend2(a);
+  auto vb = ::dpct::detail::extract_and_sign_or_zero_extend4(b);
+  res += va[0] * vb[0];
+  res += va[1] * vb[1];
+#endif
+  return res;
+}
+
+/// Two-way dot product-accumulate. Calculate and return interger_vector2(
+/// \param a) dot product interger_vector2(high_16bit( \param b)) + \param c
+///
+/// \tparam [in] T1 The type of first value.
+/// \tparam [in] T2 The type of second value.
+/// \param [in] a The first value.
+/// \param [in] b The second value.
+/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
+/// uint32_t else has type int32_t.
+/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
+/// result.
+template <typename T1, typename T2, typename T3>
+inline auto dp2a_hi(T1 a, T2 b, T3 c) {
+  detail::dot_product_acc_t<T1, T2> res = c;
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
+    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
+  res = __dp2a_hi(a, b, c);
+#else
+  auto va = ::dpct::detail::extract_and_sign_or_zero_extend2(a);
+  auto vb = ::dpct::detail::extract_and_sign_or_zero_extend4(b);
+  res += va[0] * vb[2];
+  res += va[1] * vb[3];
+#endif
+  return res;
+}
+
+/// Four-way byte dot product-accumulate. Calculate and return interger_vector4(
+/// \param a) dot product interger_vector4( \param b)  + \param c
+///
+/// \tparam [in] T1 The type of first value.
+/// \tparam [in] T2 The type of second value.
+/// \param [in] a The first value.
+/// \param [in] b The second value.
+/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
+/// uint32_t else has type int32_t.
+/// \return Four-way byte dot product which is accumulated in 32-bit result.
+template <typename T1, typename T2, typename T3>
+inline auto dp4a(T1 a, T2 b, T3 c) {
+  detail::dot_product_acc_t<T1, T2> res = c;
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
+    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
+  res = __dp4a(a, b, c);
+#else
+  auto va = ::dpct::detail::extract_and_sign_or_zero_extend4(a);
+  auto vb = ::dpct::detail::extract_and_sign_or_zero_extend4(b);
+  res += va[0] * vb[0];
+  res += va[1] * vb[1];
+  res += va[2] * vb[2];
+  res += va[3] * vb[3];
+#endif
+  return res;
+}
+
+/// Extend \p a and \p b to 33 bit and add them.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The extend addition of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_add(AT a, BT b) {
+  return detail::extend_binary<RetT, false>(a, b, std::plus());
+}
+
+/// Extend Inputs to 33 bit, add \p a, \p b, then do \p second_op with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] CT The type of the third value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the second operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] second_op The operation to do with the third value
+/// \returns The extend addition of \p a, \p b and \p second_op with \p c
+template <typename RetT, typename AT, typename BT, typename CT,
+          typename BinaryOperation>
+inline constexpr RetT extend_add(AT a, BT b, CT c, BinaryOperation second_op) {
+  return detail::extend_binary<RetT, false>(a, b, c, std::plus(), second_op);
+}
+
+/// Extend \p a and \p b to 33 bit and add them with saturation.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The extend addition of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_add_sat(AT a, BT b) {
+  return detail::extend_binary<RetT, true>(a, b, std::plus());
+}
+
+/// Extend Inputs to 33 bit, add \p a, \p b with saturation, then do \p
+/// second_op with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] CT The type of the third value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the second operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] second_op The operation to do with the third value
+/// \returns The extend addition of \p a, \p b with saturation and \p second_op
+/// with \p c
+template <typename RetT, typename AT, typename BT, typename CT,
+          typename BinaryOperation>
+inline constexpr RetT extend_add_sat(AT a, BT b, CT c,
+                                     BinaryOperation second_op) {
+  return detail::extend_binary<RetT, true>(a, b, c, std::plus(), second_op);
+}
+
+/// Extend \p a and \p b to 33 bit and minus them.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The extend subtraction of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_sub(AT a, BT b) {
+  return detail::extend_binary<RetT, false>(a, b, std::minus());
+}
+
+/// Extend Inputs to 33 bit, minus \p a, \p b, then do \p second_op with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] CT The type of the third value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the second operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] second_op The operation to do with the third value
+/// \returns The extend subtraction of \p a, \p b and \p second_op with \p c
+template <typename RetT, typename AT, typename BT, typename CT,
+          typename BinaryOperation>
+inline constexpr RetT extend_sub(AT a, BT b, CT c, BinaryOperation second_op) {
+  return detail::extend_binary<RetT, false>(a, b, c, std::minus(), second_op);
+}
+
+/// Extend \p a and \p b to 33 bit and minus them with saturation.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The extend subtraction of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_sub_sat(AT a, BT b) {
+  return detail::extend_binary<RetT, true>(a, b, std::minus());
+}
+
+/// Extend Inputs to 33 bit, minus \p a, \p b with saturation, then do \p
+/// second_op with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] CT The type of the third value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the second operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] second_op The operation to do with the third value
+/// \returns The extend subtraction of \p a, \p b with saturation and \p
+/// second_op with \p c
+template <typename RetT, typename AT, typename BT, typename CT,
+          typename BinaryOperation>
+inline constexpr RetT extend_sub_sat(AT a, BT b, CT c,
+                                     BinaryOperation second_op) {
+  return detail::extend_binary<RetT, true>(a, b, c, std::minus(), second_op);
+}
+
+/// Extend \p a and \p b to 33 bit and do abs_diff.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The extend abs_diff of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_absdiff(AT a, BT b) {
+  return detail::extend_binary<RetT, false>(a, b, abs_diff());
+}
+
+/// Extend Inputs to 33 bit, abs_diff \p a, \p b, then do \p second_op with \p
+/// c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] CT The type of the third value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the second operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] second_op The operation to do with the third value
+/// \returns The extend abs_diff of \p a, \p b and \p second_op with \p c
+template <typename RetT, typename AT, typename BT, typename CT,
+          typename BinaryOperation>
+inline constexpr RetT extend_absdiff(AT a, BT b, CT c,
+                                     BinaryOperation second_op) {
+  return detail::extend_binary<RetT, false>(a, b, c, abs_diff(), second_op);
+}
+
+/// Extend \p a and \p b to 33 bit and do abs_diff with saturation.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The extend abs_diff of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_absdiff_sat(AT a, BT b) {
+  return detail::extend_binary<RetT, true>(a, b, abs_diff());
+}
+
+/// Extend Inputs to 33 bit, abs_diff \p a, \p b with saturation, then do \p
+/// second_op with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] CT The type of the third value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the second operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] second_op The operation to do with the third value
+/// \returns The extend abs_diff of \p a, \p b with saturation and \p
+/// second_op with \p c
+template <typename RetT, typename AT, typename BT, typename CT,
+          typename BinaryOperation>
+inline constexpr RetT extend_absdiff_sat(AT a, BT b, CT c,
+                                         BinaryOperation second_op) {
+  return detail::extend_binary<RetT, true>(a, b, c, abs_diff(), second_op);
+}
+
+/// Extend \p a and \p b to 33 bit and return smaller one.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The smaller one of the two extended values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_min(AT a, BT b) {
+  return detail::extend_binary<RetT, false>(a, b, minimum());
+}
+
+/// Extend Inputs to 33 bit, find the smaller one in \p a, \p b, then do \p
+/// second_op with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] CT The type of the third value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the second operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] second_op The operation to do with the third value
+/// \returns The smaller one of \p a, \p b and \p second_op with \p c
+template <typename RetT, typename AT, typename BT, typename CT,
+          typename BinaryOperation>
+inline constexpr RetT extend_min(AT a, BT b, CT c, BinaryOperation second_op) {
+  return detail::extend_binary<RetT, false>(a, b, c, minimum(), second_op);
+}
+
+/// Extend \p a and \p b to 33 bit and return smaller one with saturation.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The smaller one of the two extended values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_min_sat(AT a, BT b) {
+  return detail::extend_binary<RetT, true>(a, b, minimum());
+}
+
+/// Extend Inputs to 33 bit, find the smaller one in \p a, \p b with saturation,
+/// then do \p second_op with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] CT The type of the third value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the second operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] second_op The operation to do with the third value
+/// \returns The smaller one of \p a, \p b with saturation and \p
+/// second_op with \p c
+template <typename RetT, typename AT, typename BT, typename CT,
+          typename BinaryOperation>
+inline constexpr RetT extend_min_sat(AT a, BT b, CT c,
+                                     BinaryOperation second_op) {
+  return detail::extend_binary<RetT, true>(a, b, c, minimum(), second_op);
+}
+
+/// Extend \p a and \p b to 33 bit and return bigger one.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The bigger one of the two extended values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_max(AT a, BT b) {
+  return detail::extend_binary<RetT, false>(a, b, maximum());
+}
+
+/// Extend Inputs to 33 bit, find the bigger one in \p a, \p b, then do \p
+/// second_op with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] CT The type of the third value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the second operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] second_op The operation to do with the third value
+/// \returns The bigger one of \p a, \p b and \p second_op with \p c
+template <typename RetT, typename AT, typename BT, typename CT,
+          typename BinaryOperation>
+inline constexpr RetT extend_max(AT a, BT b, CT c, BinaryOperation second_op) {
+  return detail::extend_binary<RetT, false>(a, b, c, maximum(), second_op);
+}
+
+/// Extend \p a and \p b to 33 bit and return bigger one with saturation.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \returns The bigger one of the two extended values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_max_sat(AT a, BT b) {
+  return detail::extend_binary<RetT, true>(a, b, maximum());
+}
+
+/// Extend Inputs to 33 bit, find the bigger one in \p a, \p b with saturation,
+/// then do \p second_op with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] CT The type of the third value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the second operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] second_op The operation to do with the third value
+/// \returns The bigger one of \p a, \p b with saturation and \p
+/// second_op with \p c
+template <typename RetT, typename AT, typename BT, typename CT,
+          typename BinaryOperation>
+inline constexpr RetT extend_max_sat(AT a, BT b, CT c,
+                                     BinaryOperation second_op) {
+  return detail::extend_binary<RetT, true>(a, b, c, maximum(), second_op);
+}
+
+/// Compute vectorized addition of \p a and \p b, with each value treated as a
+/// 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, false>(a, b, c, std::plus());
+}
+
+/// Compute vectorized addition of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized addition of the two
+/// values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, true>(a, b, c, std::plus());
+}
+
+/// Compute vectorized addition of \p a and \p b with saturation, with each
+/// value treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, true, false>(a, b, c, std::plus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized subtraction of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, false>(a, b, c, std::minus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 2 elements vector type and extend each element to 17 bit. Then add each
+/// half of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized subtraction of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, true>(a, b, c, std::minus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b with saturation, with each
+/// value treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized subtraction of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, true, false>(a, b, c, std::minus());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized abs_diff of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, false>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized abs_diff of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, true>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
+/// value treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized abs_diff of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, true, false>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized minimum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, false>(a, b, c, minimum());
+}
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized minimum of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, true>(a, b, c, minimum());
+}
+
+/// Compute vectorized minimum of \p a and \p b with saturation, with each value
+/// treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized minimum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, true, false>(a, b, c, minimum());
+}
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized maximum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, false>(a, b, c, maximum());
+}
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized maximum of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, true>(a, b, c, maximum());
+}
+
+/// Compute vectorized maximum of \p a and \p b with saturation, with each value
+/// treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized maximum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, true, false>(a, b, c, maximum());
+}
+
+/// Compute vectorized addition of \p a and \p b, with each value treated as a
+/// 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, false>(a, b, c, std::plus());
+}
+
+/// Compute vectorized addition of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized addition of the two
+/// values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, true>(a, b, c, std::plus());
+}
+
+/// Compute vectorized addition of \p a and \p b with saturation, with each
+/// value treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, true, false>(a, b, c, std::plus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized subtraction of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, false>(a, b, c, std::minus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 4 elements vector type and extend each element to 9 bit. Then add each
+/// half of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized subtraction of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, true>(a, b, c, std::minus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b with saturation, with each
+/// value treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized subtraction of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, true, false>(a, b, c, std::minus());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized abs_diff of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, false>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized abs_diff of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, true>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
+/// value treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized abs_diff of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, true, false>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized minimum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, false>(a, b, c, minimum());
+}
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized minimum of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, true>(a, b, c, minimum());
+}
+
+/// Compute vectorized minimum of \p a and \p b with saturation, with each value
+/// treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized minimum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, true, false>(a, b, c, minimum());
+}
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized maximum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, false>(a, b, c, maximum());
+}
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized maximum of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, true>(a, b, c, maximum());
+}
+
+/// Compute vectorized maximum of \p a and \p b with saturation, with each value
+/// treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized maximum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, true, false>(a, b, c, maximum());
+}
+} // namespace dpct
+
+#endif // __DPCT_MATH_HPP__
diff --git a/dpct/memory.hpp b/dpct/memory.hpp
new file mode 100644
index 0000000000000..88afe77fafcf3
--- /dev/null
+++ b/dpct/memory.hpp
@@ -0,0 +1,1497 @@
+//==---- memory.hpp -------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_MEMORY_HPP__
+#define __DPCT_MEMORY_HPP__
+
+#include "device.hpp"
+#include <sycl/sycl.hpp>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <mutex>
+#include <unordered_map>
+#include <map>
+#include <utility>
+#include <thread>
+#include <type_traits>
+
+#if defined(__linux__)
+#include <sys/mman.h>
+#elif defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#else
+#error "Only support Windows and Linux."
+#endif
+
+namespace dpct {
+
+enum memcpy_direction {
+  host_to_host,
+  host_to_device,
+  device_to_host,
+  device_to_device,
+  automatic
+};
+enum memory_region {
+  global = 0,  // device global memory
+  constant,    // device constant memory
+  local,       // device local memory
+  shared,      // memory which can be accessed by host and device
+};
+
+typedef uint8_t byte_t;
+
+/// Buffer type to be used in Memory Management runtime.
+typedef sycl::buffer<byte_t> buffer_t;
+
+/// Pitched 2D/3D memory data.
+class pitched_data {
+public:
+  pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
+  pitched_data(void *data, size_t pitch, size_t x, size_t y)
+      : _data(data), _pitch(pitch), _x(x), _y(y) {}
+
+  void *get_data_ptr() { return _data; }
+  void set_data_ptr(void *data) { _data = data; }
+
+  size_t get_pitch() { return _pitch; }
+  void set_pitch(size_t pitch) { _pitch = pitch; }
+
+  size_t get_x() { return _x; }
+  void set_x(size_t x) { _x = x; };
+
+  size_t get_y() { return _y; }
+  void set_y(size_t y) { _y = y; }
+
+private:
+  void *_data;
+  size_t _pitch, _x, _y;
+};
+
+namespace detail {
+class mem_mgr {
+  mem_mgr() {
+    // Reserved address space, no real memory allocation happens here.
+#if defined(__linux__)
+    mapped_address_space =
+        (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
+                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+#elif defined(_WIN64)
+    mapped_address_space = (byte_t *)VirtualAlloc(
+        NULL,               // NULL specified as the base address parameter
+        mapped_region_size, // Size of allocation
+        MEM_RESERVE,        // Allocate reserved pages
+        PAGE_NOACCESS);     // Protection = no access
+#else
+#error "Only support Windows and Linux."
+#endif
+    next_free = mapped_address_space;
+  };
+
+public:
+  using buffer_id_t = int;
+
+  struct allocation {
+    buffer_t buffer;
+    byte_t *alloc_ptr;
+    size_t size;
+  };
+
+  ~mem_mgr() {
+#if defined(__linux__)
+    munmap(mapped_address_space, mapped_region_size);
+#elif defined(_WIN64)
+    VirtualFree(mapped_address_space, 0, MEM_RELEASE);
+#else
+#error "Only support Windows and Linux."
+#endif
+  };
+
+  mem_mgr(const mem_mgr &) = delete;
+  mem_mgr &operator=(const mem_mgr &) = delete;
+  mem_mgr(mem_mgr &&) = delete;
+  mem_mgr &operator=(mem_mgr &&) = delete;
+
+  /// Allocate
+  void *mem_alloc(size_t size) {
+    if (!size)
+      return nullptr;
+    std::lock_guard<std::mutex> lock(m_mutex);
+    if (next_free + size > mapped_address_space + mapped_region_size) {
+      throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool");
+    }
+    // Allocation
+    sycl::range<1> r(size);
+    buffer_t buf(r);
+    allocation A{buf, next_free, size};
+    // Map allocation to device pointer
+    void *result = next_free;
+    m_map.emplace(next_free + size, A);
+    // Update pointer to the next free space.
+    next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
+
+    return result;
+  }
+
+  /// Deallocate
+  void mem_free(const void *ptr) {
+    if (!ptr)
+      return;
+    std::lock_guard<std::mutex> lock(m_mutex);
+    auto it = get_map_iterator(ptr);
+    m_map.erase(it);
+  }
+
+  /// map: device pointer -> allocation(buffer, alloc_ptr, size)
+  allocation translate_ptr(const void *ptr) {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    auto it = get_map_iterator(ptr);
+    return it->second;
+  }
+
+  /// Check if the pointer represents device pointer or not.
+  bool is_device_ptr(const void *ptr) const {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    return (mapped_address_space <= ptr) &&
+           (ptr < mapped_address_space + mapped_region_size);
+  }
+
+  /// Returns the instance of memory manager singleton.
+  static mem_mgr &instance() {
+    static mem_mgr m;
+    return m;
+  }
+
+private:
+  std::map<byte_t *, allocation> m_map;
+  mutable std::mutex m_mutex;
+  byte_t *mapped_address_space;
+  byte_t *next_free;
+  const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
+  const size_t alignment = 256;
+  /// This padding may be defined to some positive value to debug
+  /// out of bound accesses.
+  const size_t extra_padding = 0;
+
+  std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr) {
+    auto it = m_map.upper_bound((byte_t *)ptr);
+    if (it == m_map.end()) {
+      // Not a virtual pointer.
+      throw std::runtime_error("can not get buffer from non-virtual pointer");
+    }
+    const allocation &alloc = it->second;
+    if (ptr < alloc.alloc_ptr) {
+      // Out of bound.
+      // This may happen if there's a gap between allocations due to alignment
+      // or extra padding and pointer points to this gap.
+      throw std::runtime_error("invalid virtual pointer");
+    }
+    return it;
+  }
+};
+
+template <class T, memory_region Memory, size_t Dimension> class accessor;
+template <memory_region Memory, class T = byte_t> class memory_traits {
+public:
+  static constexpr sycl::access::target target =
+      sycl::access::target::device;
+  static constexpr sycl::access_mode mode =
+      (Memory == constant) ? sycl::access_mode::read
+                           : sycl::access_mode::read_write;
+  static constexpr size_t type_size = sizeof(T);
+  using element_t =
+      typename std::conditional<Memory == constant, const T, T>::type;
+  using value_t = typename std::remove_cv<T>::type;
+  template <size_t Dimension = 1>
+  using accessor_t = typename std::conditional<
+      Memory == local, sycl::local_accessor<value_t, Dimension>,
+      sycl::accessor<T, Dimension, mode, target>>::type;
+  using pointer_t = T *;
+};
+
+static inline void *dpct_malloc(size_t size, sycl::queue &q) {
+#ifdef DPCT_USM_LEVEL_NONE
+  return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
+#else
+  return sycl::malloc_device(size, q.get_device(), q.get_context());
+#endif // DPCT_USM_LEVEL_NONE
+}
+
+#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
+static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
+                                sycl::queue &q) {
+  pitch = PITCH_DEFAULT_ALIGN(x);
+  return dpct_malloc(pitch * y * z, q);
+}
+
+/**
+ * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
+ * @tparam valueT The type of the element to be set.
+ * @param [in] q The queue in which the operation is done.
+ * @param [in] dev_ptr Pointer to the virtual device memory address.
+ * @param [in] value The value to be set.
+ * @param [in] size Number of elements to be set to the value.
+ * @return An event representing the memset operation.
+ */
+template <typename valueT>
+static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
+                                      valueT value, size_t size) {
+#ifdef DPCT_USM_LEVEL_NONE
+  auto &mm = mem_mgr::instance();
+  assert(mm.is_device_ptr(dev_ptr));
+  auto alloc = mm.translate_ptr(dev_ptr);
+  size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
+
+  return q.submit([&](sycl::handler &cgh) {
+    auto r = sycl::range<1>(size);
+    auto o = sycl::id<1>(offset);
+    auto new_buffer = alloc.buffer.reinterpret<valueT>(
+        sycl::range<1>(alloc.size / sizeof(valueT)));
+    sycl::accessor<valueT, 1, sycl::access_mode::write,
+                   sycl::access::target::device>
+        acc(new_buffer, cgh, r, o);
+    cgh.fill(acc, value);
+  });
+#else
+  return q.fill(dev_ptr, value, size);
+#endif // DPCT_USM_LEVEL_NONE
+}
+
+/**
+ * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
+ * @tparam valueT The type of the element to be set.
+ * @param [in] q The queue in which the operation is done.
+ * @param [in] data Pointer to the pitched device memory region.
+ * @param [in] value The value to be set.
+ * @param [in] size 3D memory region by number of elements.
+ * @return An event list representing the memset operations.
+ */
+template<typename valueT>
+static inline std::vector<sycl::event>
+dpct_memset(sycl::queue &q, pitched_data data, valueT value,
+            sycl::range<3> size) {
+  std::vector<sycl::event> event_list;
+  size_t slice = data.get_pitch() * data.get_y();
+  unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
+  for (size_t z = 0; z < size.get(2); ++z) {
+    unsigned char *data_ptr = data_surface;
+    for (size_t y = 0; y < size.get(1); ++y) {
+      event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
+      data_ptr += data.get_pitch();
+    }
+    data_surface += slice;
+  }
+  return event_list;
+}
+
+/**
+ * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
+ * @tparam valueT The type of the element to be set.
+ * @param [in] q The queue in which the operation is done.
+ * @param [in] ptr Pointer to the virtual device memory.
+ * @param [in] pitch The pitch size by number of elements, including padding.
+ * @param [in] val The value to be set.
+ * @param [in] x The width of memory region by number of elements.
+ * @param [in] y The height of memory region by number of elements.
+ * @return An event list representing the memset operations.
+ */
+template<typename valueT>
+static inline std::vector<sycl::event>
+dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
+            size_t y) {
+  return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
+                     sycl::range<3>(x, y, 1));
+}
+
+enum class pointer_access_attribute {
+  host_only = 0,
+  device_only,
+  host_device,
+  end
+};
+
+static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
+                                                      const void *ptr) {
+#ifdef DPCT_USM_LEVEL_NONE
+  return mem_mgr::instance().is_device_ptr(ptr)
+             ? pointer_access_attribute::device_only
+             : pointer_access_attribute::host_only;
+#else
+  switch (sycl::get_pointer_type(ptr, q.get_context())) {
+  case sycl::usm::alloc::unknown:
+    return pointer_access_attribute::host_only;
+  case sycl::usm::alloc::device:
+    return pointer_access_attribute::device_only;
+  case sycl::usm::alloc::shared:
+  case sycl::usm::alloc::host:
+    return pointer_access_attribute::host_device;
+  }
+#endif
+}
+
+static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
+                                             const void *from_ptr,
+                                             memcpy_direction dir) {
+  switch (dir) {
+  case memcpy_direction::host_to_host:
+  case memcpy_direction::host_to_device:
+  case memcpy_direction::device_to_host:
+  case memcpy_direction::device_to_device:
+    return dir;
+  case memcpy_direction::automatic: {
+    // table[to_attribute][from_attribute]
+    static const memcpy_direction
+        direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
+                       [static_cast<unsigned>(pointer_access_attribute::end)] =
+                           {{memcpy_direction::host_to_host,
+                             memcpy_direction::device_to_host,
+                             memcpy_direction::host_to_host},
+                            {memcpy_direction::host_to_device,
+                             memcpy_direction::device_to_device,
+                             memcpy_direction::device_to_device},
+                            {memcpy_direction::host_to_host,
+                             memcpy_direction::device_to_device,
+                             memcpy_direction::device_to_device}};
+    return direction_table[static_cast<unsigned>(get_pointer_attribute(
+        q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
+  }
+  default:
+    throw std::runtime_error("dpct_memcpy: invalid direction value");
+  }
+}
+
+static sycl::event
+dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+            memcpy_direction direction,
+            const std::vector<sycl::event> &dep_events = {}) {
+  if (!size)
+    return sycl::event{};
+#ifdef DPCT_USM_LEVEL_NONE
+  auto &mm = mem_mgr::instance();
+  auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+
+  switch (real_direction) {
+  case host_to_host:
+    return q.submit([&](sycl::handler &cgh) {
+      cgh.depends_on(dep_events);
+      cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); });
+    });
+  case host_to_device: {
+    auto alloc = mm.translate_ptr(to_ptr);
+    size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
+    return q.submit([&](sycl::handler &cgh) {
+      cgh.depends_on(dep_events);
+      auto r = sycl::range<1>(size);
+      auto o = sycl::id<1>(offset);
+      sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                          sycl::access::target::device>
+          acc(alloc.buffer, cgh, r, o);
+      cgh.copy(from_ptr, acc);
+    });
+  }
+  case device_to_host: {
+    auto alloc = mm.translate_ptr(from_ptr);
+    size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
+    return q.submit([&](sycl::handler &cgh) {
+      cgh.depends_on(dep_events);
+      auto r = sycl::range<1>(size);
+      auto o = sycl::id<1>(offset);
+      sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                          sycl::access::target::device>
+          acc(alloc.buffer, cgh, r, o);
+      cgh.copy(acc, to_ptr);
+    });
+  }
+  case device_to_device: {
+    auto to_alloc = mm.translate_ptr(to_ptr);
+    auto from_alloc = mm.translate_ptr(from_ptr);
+    size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
+    size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
+    return q.submit([&](sycl::handler &cgh) {
+      cgh.depends_on(dep_events);
+      auto r = sycl::range<1>(size);
+      auto to_o = sycl::id<1>(to_offset);
+      auto from_o = sycl::id<1>(from_offset);
+      sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                          sycl::access::target::device>
+          to_acc(to_alloc.buffer, cgh, r, to_o);
+      sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                          sycl::access::target::device>
+          from_acc(from_alloc.buffer, cgh, r, from_o);
+      cgh.copy(from_acc, to_acc);
+    });
+  }
+  default:
+    throw std::runtime_error("dpct_memcpy: invalid direction value");
+  }
+#else
+  return q.memcpy(to_ptr, from_ptr, size, dep_events);
+#endif // DPCT_USM_LEVEL_NONE
+}
+
+// Get actual copy range and make sure it will not exceed range.
+static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
+                                    size_t pitch) {
+  return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
+}
+
+static inline size_t get_offset(sycl::id<3> id, size_t slice,
+                                    size_t pitch) {
+  return slice * id.get(2) + pitch * id.get(1) + id.get(0);
+}
+
+/// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
+/// and \p from_range to another specified by \p to_ptr and \p to_range.
+static inline std::vector<sycl::event>
+dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+            sycl::range<3> to_range, sycl::range<3> from_range,
+            sycl::id<3> to_id, sycl::id<3> from_id,
+            sycl::range<3> size, memcpy_direction direction,
+            const std::vector<sycl::event> &dep_events = {}) {
+  // RAII for host pointer
+  class host_buffer {
+    void *_buf;
+    size_t _size;
+    sycl::queue &_q;
+    const std::vector<sycl::event> &_deps; // free operation depends
+
+  public:
+    host_buffer(size_t size, sycl::queue &q,
+                const std::vector<sycl::event> &deps)
+        : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
+    void *get_ptr() const { return _buf; }
+    size_t get_size() const { return _size; }
+    ~host_buffer() {
+      if (_buf) {
+        _q.submit([&](sycl::handler &cgh) {
+          cgh.depends_on(_deps);
+          cgh.host_task([buf = _buf] { std::free(buf); });
+        });
+      }
+    }
+  };
+  std::vector<sycl::event> event_list;
+
+  size_t to_slice = to_range.get(1) * to_range.get(0),
+         from_slice = from_range.get(1) * from_range.get(0);
+  unsigned char *to_surface =
+      (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+  const unsigned char *from_surface =
+      (const unsigned char *)from_ptr +
+      get_offset(from_id, from_slice, from_range.get(0));
+
+  if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) {
+    return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                        direction, dep_events)};
+  }
+  direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+  size_t size_slice = size.get(1) * size.get(0);
+  switch (direction) {
+  case host_to_host:
+    for (size_t z = 0; z < size.get(2); ++z) {
+      unsigned char *to_ptr = to_surface;
+      const unsigned char *from_ptr = from_surface;
+      if (to_range.get(0) == from_range.get(0) &&
+          to_range.get(0) == size.get(0)) {
+        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
+                                         direction, dep_events));
+      } else {
+        for (size_t y = 0; y < size.get(1); ++y) {
+          event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
+                                           direction, dep_events));
+          to_ptr += to_range.get(0);
+          from_ptr += from_range.get(0);
+        }
+      }
+      to_surface += to_slice;
+      from_surface += from_slice;
+    }
+    break;
+  case host_to_device: {
+    host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
+                    event_list);
+    std::vector<sycl::event> host_events;
+    if (to_slice == size_slice) {
+      // Copy host data to a temp host buffer with the shape of target.
+      host_events =
+          dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
+                      sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                      host_to_host, dep_events);
+    } else {
+      // Copy host data to a temp host buffer with the shape of target.
+      host_events = dpct_memcpy(
+          q, buf.get_ptr(), from_surface, to_range, from_range,
+          sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
+          // If has padding data, not sure whether it is useless. So fill temp
+          // buffer with it.
+          std::vector<sycl::event>{
+              dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                          device_to_host, dep_events)});
+    }
+    // Copy from temp host buffer to device with only one submit.
+    event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
+                                     buf.get_size(), host_to_device,
+                                     host_events));
+    break;
+  }
+  case device_to_host: {
+    host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
+                    event_list);
+    // Copy from host temp buffer to host target with reshaping.
+    event_list = dpct_memcpy(
+        q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
+        sycl::id<3>(0, 0, 0), size, host_to_host,
+        // Copy from device to temp host buffer with only one submit.
+        std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
+                                                 buf.get_size(),
+                                                 device_to_host, dep_events)});
+    break;
+  }
+  case device_to_device:
+#ifdef DPCT_USM_LEVEL_NONE
+  {
+    auto &mm = mem_mgr::instance();
+    auto to_alloc = mm.translate_ptr(to_surface);
+    auto from_alloc = mm.translate_ptr(from_surface);
+    size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
+    size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
+    event_list.push_back(q.submit([&](sycl::handler &cgh) {
+      cgh.depends_on(dep_events);
+      auto to_o = sycl::id<1>(to_offset);
+      auto from_o = sycl::id<1>(from_offset);
+      sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                         sycl::access::target::device>
+          to_acc(to_alloc.buffer, cgh,
+                 get_copy_range(size, to_slice, to_range.get(0)), to_o);
+      sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                         sycl::access::target::device>
+          from_acc(from_alloc.buffer, cgh,
+                   get_copy_range(size, from_slice, from_range.get(0)), from_o);
+      cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
+          size,
+          [=](sycl::id<3> id) {
+            to_acc[get_offset(id, to_slice, to_range.get(0))] =
+                from_acc[get_offset(id, from_slice, from_range.get(0))];
+          });
+    }));
+  }
+#else
+    event_list.push_back(q.submit([&](sycl::handler &cgh) {
+      cgh.depends_on(dep_events);
+      cgh.parallel_for<class dpct_memcpy_3d_detail>(
+          size,
+          [=](sycl::id<3> id) {
+            to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                from_surface[get_offset(id, from_slice, from_range.get(0))];
+          });
+    }));
+#endif
+  break;
+  default:
+    throw std::runtime_error("dpct_memcpy: invalid direction value");
+  }
+  return event_list;
+}
+
+/// memcpy 2D/3D matrix specified by pitched_data.
+static inline std::vector<sycl::event>
+dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+            pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+            memcpy_direction direction = automatic) {
+  return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
+                     sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                     sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                     size, direction);
+}
+
+/// memcpy 2D matrix with pitch.
+static inline std::vector<sycl::event>
+dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+            size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+            memcpy_direction direction = automatic) {
+  return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
+                     sycl::range<3>(from_pitch, y, 1),
+                     sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                     sycl::range<3>(x, y, 1), direction);
+}
+
+namespace deprecated {
+
+template <typename T, sycl::usm::alloc AllocKind>
+class usm_allocator {
+private:
+  using Alloc = sycl::usm_allocator<T, AllocKind>;
+  Alloc _impl;
+
+public:
+  using value_type = typename std::allocator_traits<Alloc>::value_type;
+  using pointer = typename std::allocator_traits<Alloc>::pointer;
+  using const_pointer = typename std::allocator_traits<Alloc>::const_pointer;
+  using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
+  using const_void_pointer =
+      typename std::allocator_traits<Alloc>::const_void_pointer;
+  using reference = typename std::allocator_traits<Alloc>::value_type &;
+  using const_reference =
+      const typename std::allocator_traits<Alloc>::value_type &;
+  using difference_type =
+      typename std::allocator_traits<Alloc>::difference_type;
+  using size_type = typename std::allocator_traits<Alloc>::size_type;
+  using propagate_on_container_copy_assignment = typename std::allocator_traits<
+      Alloc>::propagate_on_container_copy_assignment;
+  using propagate_on_container_move_assignment = typename std::allocator_traits<
+      Alloc>::propagate_on_container_move_assignment;
+  using propagate_on_container_swap =
+      typename std::allocator_traits<Alloc>::propagate_on_container_swap;
+  using is_always_equal =
+      typename std::allocator_traits<Alloc>::is_always_equal;
+
+  template <typename U> struct rebind {
+    typedef usm_allocator<U, AllocKind> other;
+  };
+
+  usm_allocator() : _impl(dpct::get_default_queue()) {}
+  ~usm_allocator() {}
+  usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
+  usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
+  pointer address(reference r) { return &r; }
+  const_pointer address(const_reference r) { return &r; }
+  pointer allocate(size_type cnt, const_void_pointer hint = nullptr) {
+    return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint);
+  }
+  void deallocate(pointer p, size_type cnt) {
+    std::allocator_traits<Alloc>::deallocate(_impl, p, cnt);
+  }
+  size_type max_size() const {
+    return std::allocator_traits<Alloc>::max_size(_impl);
+  }
+  bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
+  bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
+};
+
+} // namespace deprecated
+
+inline void dpct_free(void *ptr,
+                      const sycl::queue &q) {
+  if (ptr) {
+#ifdef DPCT_USM_LEVEL_NONE
+    detail::mem_mgr::instance().mem_free(ptr);
+#else
+    sycl::free(ptr, q.get_context());
+#endif // DPCT_USM_LEVEL_NONE
+  }
+}
+} // namespace detail
+
+#ifdef DPCT_USM_LEVEL_NONE
+/// Check if the pointer \p ptr represents device pointer or not.
+///
+/// \param ptr The pointer to be checked.
+/// \returns true if \p ptr is a device pointer.
+template<class T>
+static inline bool is_device_ptr(T ptr) {
+  if constexpr (std::is_pointer<T>::value) {
+    return detail::mem_mgr::instance().is_device_ptr(ptr);
+  }
+  return false;
+}
+#endif
+
+/// Get the buffer and the offset of a piece of memory pointed to by \p ptr.
+///
+/// \param ptr Pointer to a piece of memory.
+/// If NULL is passed as an argument, an exception will be thrown.
+/// \returns a pair containing both the buffer and the offset.
+static std::pair<buffer_t, size_t> get_buffer_and_offset(const void *ptr) {
+  if (ptr) {
+    auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
+    size_t offset = (byte_t *)ptr - alloc.alloc_ptr;
+    return std::make_pair(alloc.buffer, offset);
+  } else {
+    throw std::runtime_error(
+        "NULL pointer argument in get_buffer_and_offset function is invalid");
+  }
+}
+
+/// Get the data pointed from \p ptr as a 1D buffer reinterpreted as type T.
+template <typename T> static sycl::buffer<T> get_buffer(const void *ptr) {
+  if (!ptr)
+    return sycl::buffer<T>(sycl::range<1>(0));
+  auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
+  return alloc.buffer.reinterpret<T>(
+      sycl::range<1>(alloc.size / sizeof(T)));
+}
+
+/// Get the buffer of a piece of memory pointed to by \p ptr.
+///
+/// \param ptr Pointer to a piece of memory.
+/// \returns the buffer.
+static buffer_t get_buffer(const void *ptr) {
+  return detail::mem_mgr::instance().translate_ptr(ptr).buffer;
+}
+
+/// A wrapper class contains an accessor and an offset.
+template <typename dataT,
+          sycl::access_mode accessMode = sycl::access_mode::read_write>
+class access_wrapper {
+  sycl::accessor<byte_t, 1, accessMode> accessor;
+  size_t offset;
+
+public:
+  /// Construct the accessor wrapper for memory pointed by \p ptr.
+  ///
+  /// \param ptr Pointer to memory.
+  /// \param cgh The command group handler.
+  access_wrapper(const void *ptr, sycl::handler &cgh)
+      : accessor(get_buffer(ptr).get_access<accessMode>(cgh)), offset(0) {
+    auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
+    offset = (byte_t *)ptr - alloc.alloc_ptr;
+  }
+
+  /// Get the device pointer.
+  ///
+  /// \returns a device pointer with offset.
+  dataT get_raw_pointer() const { return (dataT)(&accessor[0] + offset); }
+};
+
+/// Get the accessor for memory pointed by \p ptr.
+///
+/// \param ptr Pointer to memory.
+/// If NULL is passed as an argument, an exception will be thrown.
+/// \param cgh The command group handler.
+/// \returns an accessor.
+template <sycl::access_mode accessMode = sycl::access_mode::read_write>
+static sycl::accessor<byte_t, 1, accessMode>
+get_access(const void *ptr, sycl::handler &cgh) {
+  if (ptr) {
+    auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
+    return alloc.buffer.get_access<accessMode>(cgh);
+  } else {
+    throw std::runtime_error(
+        "NULL pointer argument in get_access function is invalid");
+  }
+}
+
+/// Allocate memory block on the device.
+/// \param num_bytes Number of bytes to allocate.
+/// \param q Queue to execute the allocate task.
+/// \returns A pointer to the newly allocated memory.
+template <typename T>
+static inline void *dpct_malloc(T num_bytes,
+                                sycl::queue &q = get_default_queue()) {
+  return detail::dpct_malloc(static_cast<size_t>(num_bytes), q);
+}
+
+/// Get the host pointer from a buffer that is mapped to virtual pointer ptr.
+/// \param ptr Virtual Pointer mapped to device buffer
+/// \returns A host pointer
+template <typename T> static inline T *get_host_ptr(const void *ptr) {
+  auto BufferOffset = get_buffer_and_offset(ptr);
+  auto host_ptr =
+      BufferOffset.first.get_host_access()
+          .get_pointer();
+  return (T *)(host_ptr + BufferOffset.second);
+}
+
+/// Allocate memory block for 3D array on the device.
+/// \param size Size of the memory block, in bytes.
+/// \param q Queue to execute the allocate task.
+/// \returns A pitched_data object which stores the memory info.
+static inline pitched_data
+dpct_malloc(sycl::range<3> size, sycl::queue &q = get_default_queue()) {
+  pitched_data pitch(nullptr, 0, size.get(0), size.get(1));
+  size_t pitch_size;
+  pitch.set_data_ptr(detail::dpct_malloc(pitch_size, size.get(0), size.get(1),
+                                         size.get(2), q));
+  pitch.set_pitch(pitch_size);
+  return pitch;
+}
+
+/// Allocate memory block for 2D array on the device.
+/// \param [out] pitch Aligned size of x in bytes.
+/// \param x Range in dim x.
+/// \param y Range in dim y.
+/// \param q Queue to execute the allocate task.
+/// \returns A pointer to the newly allocated memory.
+static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y,
+                                sycl::queue &q = get_default_queue()) {
+  return detail::dpct_malloc(pitch, x, y, 1, q);
+}
+
+/// free
+/// \param ptr Point to free.
+/// \param q Queue to execute the free task.
+/// \returns no return value.
+static inline void dpct_free(void *ptr,
+                             sycl::queue &q = get_default_queue()) {
+  detail::dpct_free(ptr, q);
+}
+
+/// Free the device memory pointed by a batch of pointers in \p pointers which
+/// are related to \p q after \p events completed.
+///
+/// \param pointers The pointers point to the device memory requested to be freed.
+/// \param events The events to be waited.
+/// \param q The sycl::queue the memory relates to.
+inline void async_dpct_free(const std::vector<void *> &pointers,
+                            const std::vector<sycl::event> &events,
+                            sycl::queue &q = get_default_queue()) {
+  q.submit([&](sycl::handler &cgh) {
+    cgh.depends_on(events);
+    cgh.host_task([=] {
+      for (auto p : pointers)
+        if (p) {
+          detail::dpct_free(p, q);
+        }
+    });
+  });
+}
+
+/// Synchronously copies \p size bytes from the address specified by \p from_ptr
+/// to the address specified by \p to_ptr. The value of \p direction is used to
+/// set the copy direction, it can be \a host_to_host, \a host_to_device,
+/// \a device_to_host, \a device_to_device or \a automatic. The function will
+/// return after the copy is completed.
+///
+/// \param to_ptr Pointer to destination memory address.
+/// \param from_ptr Pointer to source memory address.
+/// \param size Number of bytes to be copied.
+/// \param direction Direction of the copy.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static void dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
+                        memcpy_direction direction = automatic,
+                        sycl::queue &q = get_default_queue()) {
+  detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction).wait();
+}
+
+/// Asynchronously copies \p size bytes from the address specified by \p
+/// from_ptr to the address specified by \p to_ptr. The value of \p direction is
+/// used to set the copy direction, it can be \a host_to_host, \a
+/// host_to_device, \a device_to_host, \a device_to_device or \a automatic. The
+/// return of the function does NOT guarantee the copy is completed.
+///
+/// \param to_ptr Pointer to destination memory address.
+/// \param from_ptr Pointer to source memory address.
+/// \param size Number of bytes to be copied.
+/// \param direction Direction of the copy.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
+                              memcpy_direction direction = automatic,
+                              sycl::queue &q = dpct::get_default_queue()) {
+  detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
+}
+
+/// Synchronously copies 2D matrix specified by \p x and \p y from the address
+/// specified by \p from_ptr to the address specified by \p to_ptr, while \p
+/// from_pitch and \p to_pitch are the range of dim x in bytes of the matrix
+/// specified by \p from_ptr and \p to_ptr. The value of \p direction is used to
+/// set the copy direction, it can be \a host_to_host, \a host_to_device, \a
+/// device_to_host, \a device_to_device or \a automatic. The function will
+/// return after the copy is completed.
+///
+/// \param to_ptr Pointer to destination memory address.
+/// \param to_pitch Range of dim x in bytes of destination matrix.
+/// \param from_ptr Pointer to source memory address.
+/// \param from_pitch Range of dim x in bytes of source matrix.
+/// \param x Range of dim x of matrix to be copied.
+/// \param y Range of dim y of matrix to be copied.
+/// \param direction Direction of the copy.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void dpct_memcpy(void *to_ptr, size_t to_pitch,
+                               const void *from_ptr, size_t from_pitch,
+                               size_t x, size_t y,
+                               memcpy_direction direction = automatic,
+                               sycl::queue &q = dpct::get_default_queue()) {
+  sycl::event::wait(detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch,
+                                            from_pitch, x, y, direction));
+}
+
+/// Asynchronously copies 2D matrix specified by \p x and \p y from the address
+/// specified by \p from_ptr to the address specified by \p to_ptr, while \p
+/// \p from_pitch and \p to_pitch are the range of dim x in bytes of the matrix
+/// specified by \p from_ptr and \p to_ptr. The value of \p direction is used to
+/// set the copy direction, it can be \a host_to_host, \a host_to_device, \a
+/// device_to_host, \a device_to_device or \a automatic. The return of the
+/// function does NOT guarantee the copy is completed.
+///
+/// \param to_ptr Pointer to destination memory address.
+/// \param to_pitch Range of dim x in bytes of destination matrix.
+/// \param from_ptr Pointer to source memory address.
+/// \param from_pitch Range of dim x in bytes of source matrix.
+/// \param x Range of dim x of matrix to be copied.
+/// \param y Range of dim y of matrix to be copied.
+/// \param direction Direction of the copy.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void
+async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
+                  size_t from_pitch, size_t x, size_t y,
+                  memcpy_direction direction = automatic,
+                  sycl::queue &q = get_default_queue()) {
+  detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
+                      direction);
+}
+
+/// Synchronously copies a subset of a 3D matrix specified by \p to to another
+/// 3D matrix specified by \p from. The from and to position info are specified
+/// by \p from_pos and \p to_pos The copied matrix size is specified by \p size.
+/// The value of \p direction is used to set the copy direction, it can be \a
+/// host_to_host, \a host_to_device, \a device_to_host, \a device_to_device or
+/// \a automatic. The function will return after the copy is completed.
+///
+/// \param to Destination matrix info.
+/// \param to_pos Position of destination.
+/// \param from Source matrix info.
+/// \param from_pos Position of destination.
+/// \param size Range of the submatrix to be copied.
+/// \param direction Direction of the copy.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void dpct_memcpy(pitched_data to, sycl::id<3> to_pos,
+                               pitched_data from, sycl::id<3> from_pos,
+                               sycl::range<3> size,
+                               memcpy_direction direction = automatic,
+                               sycl::queue &q = dpct::get_default_queue()) {
+  sycl::event::wait(
+      detail::dpct_memcpy(q, to, to_pos, from, from_pos, size, direction));
+}
+
+/// Asynchronously copies a subset of a 3D matrix specified by \p to to another
+/// 3D matrix specified by \p from. The from and to position info are specified
+/// by \p from_pos and \p to_pos The copied matrix size is specified by \p size.
+/// The value of \p direction is used to set the copy direction, it can be \a
+/// host_to_host, \a host_to_device, \a device_to_host, \a device_to_device or
+/// \a automatic. The return of the function does NOT guarantee the copy is
+/// completed.
+///
+/// \param to Destination matrix info.
+/// \param to_pos Position of destination.
+/// \param from Source matrix info.
+/// \param from_pos Position of destination.
+/// \param size Range of the submatrix to be copied.
+/// \param direction Direction of the copy.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void
+async_dpct_memcpy(pitched_data to, sycl::id<3> to_pos, pitched_data from,
+                  sycl::id<3> from_pos, sycl::range<3> size,
+                  memcpy_direction direction = automatic,
+                  sycl::queue &q = get_default_queue()) {
+  detail::dpct_memcpy(q, to, to_pos, from, from_pos, size, direction);
+}
+/**
+ * @brief Sets 1 byte data \p value to the first \p size elements starting from
+ * \p dev_ptr in \p q synchronously.
+ * @param [in] dev_ptr Pointer to the virtual device memory address.
+ * @param [in] value The value to be set.
+ * @param [in] size Number of elements to be set to the value.
+ * @param [in] q The queue in which the operation is done.
+ */
+static void dpct_memset(void *dev_ptr, int value, size_t size,
+                        sycl::queue &q = get_default_queue()) {
+  detail::dpct_memset<unsigned char>(q, dev_ptr, value, size).wait();
+}
+
+/**
+ * @brief Sets 2 bytes data \p value to the first \p size elements starting from
+ * \p dev_ptr in \p q synchronously.
+ * @param [in] dev_ptr Pointer to the virtual device memory address.
+ * @param [in] value The value to be set.
+ * @param [in] size Number of elements to be set to the value.
+ * @param [in] q The queue in which the operation is done.
+ */
+static void dpct_memset_d16(void *dev_ptr, unsigned short value, size_t size,
+                        sycl::queue &q = get_default_queue()) {
+  detail::dpct_memset(q, dev_ptr, value, size).wait();
+}
+/**
+ * @brief Sets 4 bytes data \p value to the first \p size elements starting from
+ * \p dev_ptr in \p q synchronously.
+ * @param [in] dev_ptr Pointer to the virtual device memory address.
+ * @param [in] value The value to be set.
+ * @param [in] size Number of elements to be set to the value.
+ * @param [in] q The queue in which the operation is done.
+ */
+static void dpct_memset_d32(void *dev_ptr, unsigned int value, size_t size,
+                        sycl::queue &q = get_default_queue()) {
+  detail::dpct_memset(q, dev_ptr, value, size).wait();
+}
+
+/**
+ * @brief Sets 1 byte data \p value to the first \p size elements starting from
+ * \p dev_ptr in \p q asynchronously.
+ * @param [in] dev_ptr Pointer to the virtual device memory address.
+ * @param [in] value The value to be set.
+ * @param [in] size Number of elements to be set to the value.
+ * @param [in] q The queue in which the operation is done.
+ */
+static void async_dpct_memset(void *dev_ptr, int value, size_t size,
+                              sycl::queue &q = dpct::get_default_queue()) {
+  detail::dpct_memset<unsigned char>(q, dev_ptr, value, size);
+}
+/**
+ * @brief Sets 2 bytes data \p value to the first \p size elements starting from
+ * \p dev_ptr in \p q asynchronously.
+ * @param [in] dev_ptr Pointer to the virtual device memory address.
+ * @param [in] value The value to be set.
+ * @param [in] size Number of elements to be set to the value.
+ * @param [in] q The queue in which the operation is done.
+ */
+static void async_dpct_memset_d16(void *dev_ptr, unsigned short value, size_t size,
+                              sycl::queue &q = dpct::get_default_queue()) {
+  detail::dpct_memset(q, dev_ptr, value, size);
+}
+/**
+ * @brief Sets 4 bytes data \p value to the first \p size elements starting from
+ * \p dev_ptr in \p q asynchronously.
+ * @param [in] dev_ptr Pointer to the virtual device memory address.
+ * @param [in] value The value to be set.
+ * @param [in] size Number of elements to be set to the value.
+ * @param [in] q The queue in which the operation is done.
+ */
+static void async_dpct_memset_d32(void *dev_ptr, unsigned int value, size_t size,
+                              sycl::queue &q = dpct::get_default_queue()) {
+  detail::dpct_memset(q, dev_ptr, value, size);
+}
+
+/**
+ * @brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p ptr in \p q
+ * synchronously.
+ * @param [in] ptr Pointer to the virtual device memory.
+ * @param [in] pitch The pitch size by number of elements, including padding.
+ * @param [in] val The value to be set.
+ * @param [in] x The width of memory region by number of elements.
+ * @param [in] y The height of memory region by number of elements.
+ * @param [in] q The queue in which the operation is done.
+ */
+static inline void dpct_memset(void *ptr, size_t pitch, int val, size_t x,
+                               size_t y,
+                               sycl::queue &q = get_default_queue()) {
+  sycl::event::wait(detail::dpct_memset<unsigned char>(q, ptr, pitch, val, x, y));
+}
+/**
+ * @brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by \p ptr in \p q
+ * synchronously.
+ * @param [in] ptr Pointer to the virtual device memory.
+ * @param [in] pitch The pitch size by number of elements, including padding.
+ * @param [in] val The value to be set.
+ * @param [in] x The width of memory region by number of elements.
+ * @param [in] y The height of memory region by number of elements.
+ * @param [in] q The queue in which the operation is done.
+ */
+static inline void dpct_memset_d16(void *ptr, size_t pitch, unsigned short val, size_t x,
+                               size_t y,
+                               sycl::queue &q = get_default_queue()) {
+  sycl::event::wait(detail::dpct_memset(q, ptr, pitch, val, x, y));
+}
+/**
+ * @brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by \p ptr in \p q
+ * synchronously.
+ * @param [in] ptr Pointer to the virtual device memory.
+ * @param [in] pitch The pitch size by number of elements, including padding.
+ * @param [in] val The value to be set.
+ * @param [in] x The width of memory region by number of elements.
+ * @param [in] y The height of memory region by number of elements.
+ * @param [in] q The queue in which the operation is done.
+ */
+static inline void dpct_memset_d32(void *ptr, size_t pitch, unsigned int val, size_t x,
+                               size_t y,
+                               sycl::queue &q = get_default_queue()) {
+  sycl::event::wait(detail::dpct_memset(q, ptr, pitch, val, x, y));
+}
+
+/**
+ * @brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p ptr in \p q
+ * asynchronously.
+ * @param [in] ptr Pointer to the virtual device memory.
+ * @param [in] pitch The pitch size by number of elements, including padding.
+ * @param [in] val The value to be set.
+ * @param [in] x The width of memory region by number of elements.
+ * @param [in] y The height of memory region by number of elements.
+ * @param [in] q The queue in which the operation is done.
+ */
+static inline void async_dpct_memset(void *ptr, size_t pitch, int val, size_t x,
+                                     size_t y,
+                                     sycl::queue &q = get_default_queue()) {
+  detail::dpct_memset<unsigned char>(q, ptr, pitch, val, x, y);
+}
+
+/**
+ * @brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by \p ptr in \p q
+ * asynchronously.
+ * @param [in] ptr Pointer to the virtual device memory.
+ * @param [in] pitch The pitch size by number of elements, including padding.
+ * @param [in] val The value to be set.
+ * @param [in] x The width of memory region by number of elements.
+ * @param [in] y The height of memory region by number of elements.
+ * @param [in] q The queue in which the operation is done.
+ */
+static inline void async_dpct_memset_d16(void *ptr, size_t pitch,
+                                         unsigned short val, size_t x, size_t y,
+                                         sycl::queue &q = get_default_queue()) {
+  detail::dpct_memset(q, ptr, pitch, val, x, y);
+}
+
+/**
+ * @brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by \p ptr in \p q
+ * asynchronously.
+ * @param [in] ptr Pointer to the virtual device memory.
+ * @param [in] pitch The pitch size by number of elements, including padding.
+ * @param [in] val The value to be set.
+ * @param [in] x The width of memory region by number of elements.
+ * @param [in] y The height of memory region by number of elements.
+ * @param [in] q The queue in which the operation is done.
+ */
+static inline void async_dpct_memset_d32(void *ptr, size_t pitch,
+                                         unsigned int val, size_t x, size_t y,
+                                         sycl::queue &q = get_default_queue()) {
+  detail::dpct_memset(q, ptr, pitch, val, x, y);
+}
+
+/**
+ * @brief Sets 1 byte data \p value to the 3D memory region pointed by \p data in \p q
+ * synchronously.
+ * @param [in] data Pointer to the pitched device memory region.
+ * @param [in] value The value to be set.
+ * @param [in] size 3D memory region by number of elements.
+ * @param [in] q The queue in which the operation is done.
+ */
+static inline void dpct_memset(pitched_data pitch, int val,
+                               sycl::range<3> size,
+                               sycl::queue &q = get_default_queue()) {
+  sycl::event::wait(detail::dpct_memset<unsigned char>(q, pitch, val, size));
+}
+
+/**
+ * @brief Sets 1 byte data \p value to the 3D memory region pointed by \p data in \p q
+ * asynchronously.
+ * @param [in] data Pointer to the pitched device memory region.
+ * @param [in] value The value to be set.
+ * @param [in] size 3D memory region by number of elements.
+ * @param [in] q The queue in which the operation is done.
+ */
+static inline void async_dpct_memset(pitched_data pitch, int val,
+                                     sycl::range<3> size,
+                                     sycl::queue &q = get_default_queue()) {
+  detail::dpct_memset<unsigned char>(q, pitch, val, size);
+}
+
+/// dpct accessor used as device function parameter.
+template <class T, memory_region Memory, size_t Dimension> class accessor;
+template <class T, memory_region Memory> class accessor<T, Memory, 3> {
+public:
+  using memory_t = detail::memory_traits<Memory, T>;
+  using element_t = typename memory_t::element_t;
+  using pointer_t = typename memory_t::pointer_t;
+  using accessor_t = typename memory_t::template accessor_t<3>;
+  accessor(pointer_t data, const sycl::range<3> &in_range)
+      : _data(data), _range(in_range) {}
+  template <memory_region M = Memory>
+  accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
+      : accessor(acc, acc.get_range()) {}
+  accessor(const accessor_t &acc, const sycl::range<3> &in_range)
+      : accessor(acc.get_pointer(), in_range) {}
+  accessor<T, Memory, 2> operator[](size_t index) const {
+    sycl::range<2> sub(_range.get(1), _range.get(2));
+    return accessor<T, Memory, 2>(_data + index * sub.size(), sub);
+  }
+
+  pointer_t get_ptr() const { return _data; }
+
+private:
+  pointer_t _data;
+  sycl::range<3> _range;
+};
+template <class T, memory_region Memory> class accessor<T, Memory, 2> {
+public:
+  using memory_t = detail::memory_traits<Memory, T>;
+  using element_t = typename memory_t::element_t;
+  using pointer_t = typename memory_t::pointer_t;
+  using accessor_t = typename memory_t::template accessor_t<2>;
+  accessor(pointer_t data, const sycl::range<2> &in_range)
+      : _data(data), _range(in_range) {}
+  template <memory_region M = Memory>
+  accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
+      : accessor(acc, acc.get_range()) {}
+  accessor(const accessor_t &acc, const sycl::range<2> &in_range)
+      : accessor(acc.get_pointer(), in_range) {}
+
+  pointer_t operator[](size_t index) const {
+    return _data + _range.get(1) * index;
+  }
+
+  pointer_t get_ptr() const { return _data; }
+
+private:
+  pointer_t _data;
+  sycl::range<2> _range;
+};
+
+namespace detail {
+/// Device variable with address space of shared, global or constant.
+template <class T, memory_region Memory, size_t Dimension>
+class device_memory {
+public:
+  using accessor_t =
+      typename detail::memory_traits<Memory, T>::template accessor_t<Dimension>;
+  using value_t = typename detail::memory_traits<Memory, T>::value_t;
+  using dpct_accessor_t = dpct::accessor<T, Memory, Dimension>;
+
+  device_memory() : device_memory(sycl::range<Dimension>(1)) {}
+
+  /// Constructor of 1-D array with initializer list
+  device_memory(
+      const sycl::range<Dimension> &in_range,
+      std::initializer_list<value_t> &&init_list)
+      : device_memory(in_range) {
+    assert(init_list.size() <= in_range.size());
+    _host_ptr = (value_t *)std::malloc(_size);
+    std::memset(_host_ptr, 0, _size);
+    std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
+  }
+
+  /// Constructor of 2-D array with initializer list
+  template <size_t D = Dimension>
+  device_memory(
+      const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range,
+      std::initializer_list<std::initializer_list<value_t>> &&init_list)
+      : device_memory(in_range) {
+    assert(init_list.size() <= in_range[0]);
+    _host_ptr = (value_t *)std::malloc(_size);
+    std::memset(_host_ptr, 0, _size);
+    auto tmp_data = _host_ptr;
+    for (auto sub_list : init_list) {
+      assert(sub_list.size() <= in_range[1]);
+      std::memcpy(tmp_data, sub_list.begin(), sub_list.size() * sizeof(T));
+      tmp_data += in_range[1];
+    }
+  }
+
+  /// Constructor with range
+  device_memory(const sycl::range<Dimension> &range_in)
+      : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
+        _host_ptr(nullptr), _device_ptr(nullptr) {
+    static_assert(
+        (Memory == global) || (Memory == constant) || (Memory == shared),
+        "device memory region should be global, constant or shared");
+    // Make sure that singleton class mem_mgr and dev_mgr will destruct later
+    // than this.
+    detail::mem_mgr::instance();
+    dev_mgr::instance();
+  }
+
+  /// Constructor with range
+  template <class... Args>
+  device_memory(Args... Arguments)
+      : device_memory(sycl::range<Dimension>(Arguments...)) {}
+
+  ~device_memory() {
+    if (_device_ptr && !_reference)
+      dpct::dpct_free(_device_ptr);
+    if (_host_ptr)
+      std::free(_host_ptr);
+  }
+
+  /// Allocate memory with default queue, and init memory if has initial value.
+  void init() {
+    init(dpct::get_default_queue());
+  }
+  /// Allocate memory with specified queue, and init memory if has initial value.
+  void init(sycl::queue &q) {
+    if (_device_ptr)
+      return;
+    if (!_size)
+      return;
+    allocate_device(q);
+    if (_host_ptr)
+      detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size, host_to_device);
+  }
+
+  /// The variable is assigned to a device pointer.
+  void assign(value_t *src, size_t size) {
+    this->~device_memory();
+    new (this) device_memory(src, size);
+  }
+
+  /// Get memory pointer of the memory object, which is virtual pointer when
+  /// usm is not used, and device pointer when usm is used.
+  value_t *get_ptr() {
+    return get_ptr(get_default_queue());
+  }
+  /// Get memory pointer of the memory object, which is virtual pointer when
+  /// usm is not used, and device pointer when usm is used.
+  value_t *get_ptr(sycl::queue &q) {
+    init(q);
+    return _device_ptr;
+  }
+
+  /// Get the device memory object size in bytes.
+  size_t get_size() { return _size; }
+
+  template <size_t D = Dimension>
+  typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
+    init();
+#ifdef DPCT_USM_LEVEL_NONE
+    return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
+               _device_ptr)
+        .template get_access<sycl::access_mode::read_write>()[index];
+#else
+    return _device_ptr[index];
+#endif // DPCT_USM_LEVEL_NONE
+  }
+
+#ifdef DPCT_USM_LEVEL_NONE
+  /// Get sycl::accessor for the device memory object when usm is not used.
+  accessor_t get_access(sycl::handler &cgh) {
+    return get_buffer(_device_ptr)
+        .template reinterpret<T, Dimension>(_range)
+        .template get_access<detail::memory_traits<Memory, T>::mode,
+                             detail::memory_traits<Memory, T>::target>(cgh);
+  }
+#else
+  /// Get dpct::accessor with dimension info for the device memory object
+  /// when usm is used and dimension is greater than 1.
+  template <size_t D = Dimension>
+  typename std::enable_if<D != 1, dpct_accessor_t>::type
+  get_access(sycl::handler &cgh) {
+    return dpct_accessor_t((T *)_device_ptr, _range);
+  }
+#endif // DPCT_USM_LEVEL_NONE
+
+private:
+  device_memory(value_t *memory_ptr, size_t size)
+      : _size(size), _range(size / sizeof(T)), _reference(true),
+        _device_ptr(memory_ptr) {}
+
+  void allocate_device(sycl::queue &q) {
+#ifndef DPCT_USM_LEVEL_NONE
+    if (Memory == shared) {
+      _device_ptr = (value_t *)sycl::malloc_shared(
+          _size, q.get_device(), q.get_context());
+      return;
+    }
+#ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
+    if (Memory == constant) {
+      _device_ptr = (value_t *)sycl::malloc_device(
+          _size, q.get_device(), q.get_context(),
+          sycl::ext::oneapi::property::usm::device_read_only());
+      return;
+    }
+#endif
+#endif
+    _device_ptr = (value_t *)detail::dpct_malloc(_size, q);
+  }
+
+  size_t _size;
+  sycl::range<Dimension> _range;
+  bool _reference;
+  value_t *_host_ptr;
+  value_t *_device_ptr;
+};
+template <class T, memory_region Memory>
+class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
+public:
+  using base = device_memory<T, Memory, 1>;
+  using value_t = typename base::value_t;
+  using accessor_t =
+      typename detail::memory_traits<Memory, T>::template accessor_t<0>;
+
+  /// Constructor with initial value.
+  device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {}
+
+  /// Default constructor
+  device_memory() : base(1) {}
+
+#ifdef DPCT_USM_LEVEL_NONE
+  /// Get sycl::accessor for the device memory object when usm is not used.
+  accessor_t get_access(sycl::handler &cgh) {
+    auto buf = get_buffer(base::get_ptr())
+                   .template reinterpret<T, 1>(sycl::range<1>(1));
+    return accessor_t(buf, cgh);
+  }
+#endif // DPCT_USM_LEVEL_NONE
+};
+}
+
+template <class T, size_t Dimension>
+using global_memory = detail::device_memory<T, global, Dimension>;
+template <class T, size_t Dimension>
+using constant_memory = detail::device_memory<T, constant, Dimension>;
+template <class T, size_t Dimension>
+using shared_memory = detail::device_memory<T, shared, Dimension>;
+
+// dpct::deprecated:: is for functionality that was introduced for compatibility
+// purpose, but relies on deprecated C++ features, which are either removed or
+// will be removed in the future standards.
+// Direct use of deprecated functionality in this namespace should be avoided.
+namespace deprecated {
+
+template <typename T>
+using usm_host_allocator = detail::deprecated::usm_allocator<T, sycl::usm::alloc::host>;
+
+template <typename T>
+using usm_device_allocator = detail::deprecated::usm_allocator<T, sycl::usm::alloc::shared>;
+} // namespace deprecated
+
+class pointer_attributes {
+public:
+  void init(const void *ptr,
+              sycl::queue &q = dpct::get_default_queue()) {
+#ifdef DPCT_USM_LEVEL_NONE
+    throw std::runtime_error(
+          "dpct::pointer_attributes: only works for USM pointer.");
+#else
+    memory_type = sycl::get_pointer_type(ptr, q.get_context());
+    if (memory_type == sycl::usm::alloc::unknown) {
+      device_id = -1;
+      return;
+    }
+    device_pointer = (memory_type !=
+                        sycl::usm::alloc::unknown) ? ptr : nullptr;
+    host_pointer = (memory_type !=
+                        sycl::usm::alloc::unknown) &&
+                   (memory_type != sycl::usm::alloc::device) ? ptr : nullptr;
+    sycl::device device_obj = sycl::get_pointer_device(ptr, q.get_context());
+    device_id = dpct::dev_mgr::instance().get_device_id(device_obj);
+#endif
+  }
+
+  sycl::usm::alloc get_memory_type() {
+    return memory_type;
+  }
+
+  const void *get_device_pointer() {
+    return device_pointer;
+  }
+
+  const void *get_host_pointer() {
+    return host_pointer;
+  }
+
+  bool is_memory_shared() {
+    return memory_type == sycl::usm::alloc::shared;
+  }
+
+  unsigned int get_device_id() {
+    return device_id;
+  }
+
+private:
+  sycl::usm::alloc memory_type = sycl::usm::alloc::unknown;
+  const void *device_pointer = nullptr;
+  const void *host_pointer = nullptr;
+  unsigned int device_id = -1;
+};
+} // namespace dpct
+#endif // __DPCT_MEMORY_HPP__
diff --git a/dpct/rng_utils.hpp b/dpct/rng_utils.hpp
new file mode 100644
index 0000000000000..6c79ca564639c
--- /dev/null
+++ b/dpct/rng_utils.hpp
@@ -0,0 +1,535 @@
+//==---- rng_utils.hpp ----------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_RNG_UTILS_HPP__
+#define __DPCT_RNG_UTILS_HPP__
+
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
+#include <oneapi/mkl/rng/device.hpp>
+#endif
+#include "device.hpp"
+#include "lib_common_utils.hpp"
+
+namespace dpct {
+namespace rng {
+#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
+namespace device {
+/// The random number generator on device.
+/// \tparam engine_t The device random number generator engine. It can only be
+/// oneapi::mkl::rng::device::mrg32k3a<1> or
+/// oneapi::mkl::rng::device::mrg32k3a<4> or
+/// oneapi::mkl::rng::device::philox4x32x10<1> or
+/// oneapi::mkl::rng::device::philox4x32x10<4>.
+template <typename engine_t> class rng_generator {
+  static_assert(
+      std::disjunction_v<
+          std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<1>>,
+          std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<4>>,
+          std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<1>>,
+          std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<4>>,
+          std::is_same<engine_t, oneapi::mkl::rng::device::mcg59<1>>>,
+      "engine_t can only be oneapi::mkl::rng::device::mrg32k3a<1> or "
+      "oneapi::mkl::rng::device::mrg32k3a<4> or "
+      "oneapi::mkl::rng::device::philox4x32x10<1> or "
+      "oneapi::mkl::rng::device::philox4x32x10<4> or "
+      "oneapi::mkl::rng::device::mcg59<1>.");
+  static constexpr bool _is_engine_vec_size_one = std::disjunction_v<
+      std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<1>>,
+      std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<1>>,
+      std::is_same<engine_t, oneapi::mkl::rng::device::mcg59<1>>>;
+  static constexpr std::uint64_t default_seed = 0;
+  oneapi::mkl::rng::device::bits<std::uint32_t> _distr_bits;
+  oneapi::mkl::rng::device::uniform_bits<std::uint32_t> _distr_uniform_bits;
+  oneapi::mkl::rng::device::gaussian<float> _distr_gaussian_float;
+  oneapi::mkl::rng::device::gaussian<double> _distr_gaussian_double;
+  oneapi::mkl::rng::device::lognormal<float> _distr_lognormal_float;
+  oneapi::mkl::rng::device::lognormal<double> _distr_lognormal_double;
+  oneapi::mkl::rng::device::poisson<std::uint32_t> _distr_poisson;
+  oneapi::mkl::rng::device::uniform<float> _distr_uniform_float;
+  oneapi::mkl::rng::device::uniform<double> _distr_uniform_double;
+  engine_t _engine;
+
+public:
+  /// Default constructor of rng_generator
+  rng_generator() { _engine = engine_t(default_seed); }
+  /// Constructor of rng_generator if engine type is not mcg59
+  /// \param [in] seed The seed to initialize the engine state.
+  /// \param [in] num_to_skip Set the number of elements need to be skipped.
+  /// The number is calculated as: num_to_skip[0] + num_to_skip[1] * 2^64 +
+  /// num_to_skip[2] * 2^128 + ... + num_to_skip[n-1] * 2^(64*(n-1))
+  template <typename T = engine_t,
+            typename std::enable_if<!std::is_same_v<
+                T, oneapi::mkl::rng::device::mcg59<1>>>::type * = nullptr>
+  rng_generator(std::uint64_t seed,
+                std::initializer_list<std::uint64_t> num_to_skip) {
+    _engine = engine_t(seed, num_to_skip);
+  }
+  /// Constructor of rng_generator if engine type is mcg59
+  /// \param [in] seed The seed to initialize the engine state.
+  /// \param [in] num_to_skip Set the number of elements need to be skipped.
+  template <typename T = engine_t,
+            typename std::enable_if<std::is_same_v<
+                T, oneapi::mkl::rng::device::mcg59<1>>>::type * = nullptr>
+  rng_generator(std::uint64_t seed, std::uint64_t num_to_skip) {
+    _engine = engine_t(seed, num_to_skip);
+  }
+
+  /// Generate random number(s) obeys distribution \tparam distr_t.
+  /// \tparam T The distribution of the random number. It can only be
+  /// oneapi::mkl::rng::device::bits<std::uint32_t>,
+  /// oneapi::mkl::rng::device::uniform_bits<std::uint32_t>,
+  /// oneapi::mkl::rng::device::gaussian<float>,
+  /// oneapi::mkl::rng::device::gaussian<double>,
+  /// oneapi::mkl::rng::device::lognormal<float>,
+  /// oneapi::mkl::rng::device::lognormal<double>,
+  /// oneapi::mkl::rng::device::poisson<std::uint32_t>,
+  /// oneapi::mkl::rng::device::uniform<float> or
+  /// oneapi::mkl::rng::device::uniform<double>
+  /// \tparam vec_size The length of the return vector. It can only be 1, 2
+  /// or 4.
+  /// \param distr_params The parameter(s) for lognormal or poisson
+  /// distribution.
+  /// \return The vector of the random number(s).
+  template <typename distr_t, int vec_size, class... distr_params_t>
+  auto generate(distr_params_t... distr_params) {
+    static_assert(vec_size == 1 || vec_size == 2 || vec_size == 4,
+                  "vec_size is not supported.");
+    static_assert(
+        std::disjunction_v<
+            std::is_same<distr_t,
+                         oneapi::mkl::rng::device::bits<std::uint32_t>>,
+            std::is_same<distr_t,
+                         oneapi::mkl::rng::device::uniform_bits<std::uint32_t>>,
+            std::is_same<distr_t, oneapi::mkl::rng::device::gaussian<float>>,
+            std::is_same<distr_t, oneapi::mkl::rng::device::gaussian<double>>,
+            std::is_same<distr_t, oneapi::mkl::rng::device::lognormal<float>>,
+            std::is_same<distr_t, oneapi::mkl::rng::device::lognormal<double>>,
+            std::is_same<distr_t,
+                         oneapi::mkl::rng::device::poisson<std::uint32_t>>,
+            std::is_same<distr_t, oneapi::mkl::rng::device::uniform<float>>,
+            std::is_same<distr_t, oneapi::mkl::rng::device::uniform<double>>>,
+        "distribution is not supported.");
+
+    if constexpr (std::is_same_v<
+                      distr_t, oneapi::mkl::rng::device::bits<std::uint32_t>>) {
+      return generate_vec<vec_size>(_distr_bits);
+    }
+    if constexpr (std::is_same_v<
+                      distr_t,
+                      oneapi::mkl::rng::device::uniform_bits<std::uint32_t>>) {
+      return generate_vec<vec_size>(_distr_uniform_bits);
+    }
+    if constexpr (std::is_same_v<distr_t,
+                                 oneapi::mkl::rng::device::gaussian<float>>) {
+      return generate_vec<vec_size>(_distr_gaussian_float);
+    }
+    if constexpr (std::is_same_v<distr_t,
+                                 oneapi::mkl::rng::device::gaussian<double>>) {
+      return generate_vec<vec_size>(_distr_gaussian_double);
+    }
+    if constexpr (std::is_same_v<distr_t,
+                                 oneapi::mkl::rng::device::lognormal<float>>) {
+      return generate_vec<vec_size>(_distr_lognormal_float, distr_params...,
+                                    0.0f, 1.0f);
+    }
+    if constexpr (std::is_same_v<distr_t,
+                                 oneapi::mkl::rng::device::lognormal<double>>) {
+      return generate_vec<vec_size>(_distr_lognormal_double, distr_params...,
+                                    0.0, 1.0);
+    }
+    if constexpr (std::is_same_v<distr_t, oneapi::mkl::rng::device::poisson<
+                                              std::uint32_t>>) {
+      return generate_vec<vec_size>(_distr_poisson, distr_params...);
+    }
+    if constexpr (std::is_same_v<distr_t,
+                                 oneapi::mkl::rng::device::uniform<float>>) {
+      return generate_vec<vec_size>(_distr_uniform_float);
+    }
+    if constexpr (std::is_same_v<distr_t,
+                                 oneapi::mkl::rng::device::uniform<double>>) {
+      return generate_vec<vec_size>(_distr_uniform_double);
+    }
+  }
+
+  /// Get the random number generator engine.
+  /// \return The reference of the internal random number generator engine.
+  engine_t &get_engine() { return _engine; }
+
+private:
+  template <int vec_size, typename distr_t, class... distr_params_t>
+  auto generate_vec(distr_t &distr, distr_params_t... distr_params) {
+    if constexpr (sizeof...(distr_params_t)) {
+      typename distr_t::param_type pt(distr_params...);
+      distr.param(pt);
+    }
+    if constexpr (vec_size == 4) {
+      if constexpr (_is_engine_vec_size_one) {
+        sycl::vec<typename distr_t::result_type, 4> res;
+        res.x() = oneapi::mkl::rng::device::generate(distr, _engine);
+        res.y() = oneapi::mkl::rng::device::generate(distr, _engine);
+        res.z() = oneapi::mkl::rng::device::generate(distr, _engine);
+        res.w() = oneapi::mkl::rng::device::generate(distr, _engine);
+        return res;
+      } else {
+        return oneapi::mkl::rng::device::generate(distr, _engine);
+      }
+    } else if constexpr (vec_size == 1) {
+      if constexpr (_is_engine_vec_size_one) {
+        return oneapi::mkl::rng::device::generate(distr, _engine);
+      } else {
+        return oneapi::mkl::rng::device::generate_single(distr, _engine);
+      }
+    } else if constexpr (vec_size == 2) {
+      if constexpr (_is_engine_vec_size_one) {
+        sycl::vec<typename distr_t::result_type, 2> res;
+        res.x() = oneapi::mkl::rng::device::generate(distr, _engine);
+        res.y() = oneapi::mkl::rng::device::generate(distr, _engine);
+        return res;
+      } else {
+        sycl::vec<typename distr_t::result_type, 2> res;
+        res.x() = oneapi::mkl::rng::device::generate_single(distr, _engine);
+        res.y() = oneapi::mkl::rng::device::generate_single(distr, _engine);
+        return res;
+      }
+    }
+  }
+};
+
+} // namespace device
+#endif
+
+namespace host {
+namespace detail {
+class rng_generator_base {
+public:
+  /// Set the seed of host rng_generator.
+  /// \param seed The engine seed.
+  virtual void set_seed(const std::uint64_t seed) = 0;
+
+  /// Set the dimensions of host rng_generator.
+  /// \param dimensions The engine dimensions.
+  virtual void set_dimensions(const std::uint32_t dimensions) = 0;
+
+  /// Set the queue of host rng_generator.
+  /// \param queue The engine queue.
+  virtual void set_queue(sycl::queue *queue) = 0;
+
+  /// Generate unsigned int random number(s) with 'uniform_bits' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  virtual inline void generate_uniform_bits(unsigned int *output,
+                                            std::int64_t n) = 0;
+
+  /// Generate unsigned long long random number(s) with 'uniform_bits'
+  /// distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  virtual inline void generate_uniform_bits(unsigned long long *output,
+                                            std::int64_t n) = 0;
+
+  /// Generate float random number(s) with 'lognormal' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  /// \param m Mean of associated normal distribution
+  /// \param s Standard deviation of associated normal distribution.
+  virtual inline void generate_lognormal(float *output, std::int64_t n, float m,
+                                         float s) = 0;
+
+  /// Generate double random number(s) with 'lognormal' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  /// \param m Mean of associated normal distribution
+  /// \param s Standard deviation of associated normal distribution.
+  virtual inline void generate_lognormal(double *output, std::int64_t n,
+                                         double m, double s) = 0;
+
+  /// Generate float random number(s) with 'gaussian' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  /// \param mean Mean of normal distribution
+  /// \param stddev Standard deviation of normal distribution.
+  virtual inline void generate_gaussian(float *output, std::int64_t n,
+                                        float mean, float stddev) = 0;
+
+  /// Generate double random number(s) with 'gaussian' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  /// \param mean Mean of normal distribution
+  /// \param stddev Standard deviation of normal distribution.
+  virtual inline void generate_gaussian(double *output, std::int64_t n,
+                                        double mean, double stddev) = 0;
+
+  /// Generate unsigned int random number(s) with 'poisson' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  /// \param lambda Lambda for the Poisson distribution.
+  virtual inline void generate_poisson(unsigned int *output, std::int64_t n,
+                                       double lambda) = 0;
+
+  /// Generate float random number(s) with 'uniform' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  virtual inline void generate_uniform(float *output, std::int64_t n) = 0;
+
+  /// Generate double random number(s) with 'uniform' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  virtual inline void generate_uniform(double *output, std::int64_t n) = 0;
+
+  /// Skip ahead several random number(s).
+  /// \param num_to_skip The number of random numbers to be skipped.
+  virtual void skip_ahead(const std::uint64_t num_to_skip) = 0;
+
+  /// Set the direction numbers of host rng_generator. Only Sobol engine
+  /// supports this method.
+  /// \param direction_numbers The engine direction numbers.
+  virtual void set_direction_numbers(
+      const std::vector<std::uint32_t> &direction_numbers) = 0;
+
+protected:
+  sycl::queue *_queue{&dpct::get_default_queue()};
+  std::uint64_t _seed{0};
+  std::uint32_t _dimensions{1};
+  std::vector<std::uint32_t> _direction_numbers;
+};
+
+/// The random number generator on host.
+template <typename engine_t = oneapi::mkl::rng::philox4x32x10>
+class rng_generator : public rng_generator_base {
+public:
+  /// Constructor of rng_generator.
+  rng_generator() : _engine(create_engine(_queue, _seed, _dimensions)) {}
+
+  /// Set the seed of host rng_generator.
+  /// \param seed The engine seed.
+  void set_seed(const std::uint64_t seed) {
+    if (seed == _seed) {
+      return;
+    }
+    _seed = seed;
+    _engine = create_engine(_queue, _seed, _dimensions);
+  }
+
+  /// Set the dimensions of host rng_generator.
+  /// \param dimensions The engine dimensions.
+  void set_dimensions(const std::uint32_t dimensions) {
+    if (dimensions == _dimensions) {
+      return;
+    }
+    _dimensions = dimensions;
+    _engine = create_engine(_queue, _seed, _dimensions);
+  }
+
+  /// Set the queue of host rng_generator.
+  /// \param queue The engine queue.
+  void set_queue(sycl::queue *queue) {
+    if (queue == _queue) {
+      return;
+    }
+    _queue = queue;
+    _engine = create_engine(_queue, _seed, _dimensions);
+  }
+
+  /// Set the direction numbers of Sobol host rng_generator.
+  /// \param direction_numbers The user-defined direction numbers.
+  void
+  set_direction_numbers(const std::vector<std::uint32_t> &direction_numbers) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
+                             "Interfaces Project does not support this API.");
+#else
+    if constexpr (std::is_same_v<engine_t, oneapi::mkl::rng::sobol>) {
+      if (direction_numbers == _direction_numbers) {
+        return;
+      }
+      _direction_numbers = direction_numbers;
+      _engine = oneapi::mkl::rng::sobol(*_queue, _direction_numbers);
+    } else {
+      throw std::runtime_error("Only Sobol engine supports this method.");
+    }
+#endif
+  }
+
+  /// Generate unsigned int random number(s) with 'uniform_bits' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  inline void generate_uniform_bits(unsigned int *output, std::int64_t n) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
+                             "Interfaces Project does not support this API.");
+#else
+    static_assert(sizeof(unsigned int) == sizeof(std::uint32_t));
+    generate<oneapi::mkl::rng::uniform_bits<std::uint32_t>>(
+        (std::uint32_t *)output, n);
+#endif
+  }
+
+  /// Generate unsigned long long random number(s) with 'uniform_bits'
+  /// distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  inline void generate_uniform_bits(unsigned long long *output,
+                                    std::int64_t n) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
+                             "Interfaces Project does not support this API.");
+#else
+    static_assert(sizeof(unsigned long long) == sizeof(std::uint64_t));
+    generate<oneapi::mkl::rng::uniform_bits<std::uint64_t>>(
+        (std::uint64_t *)output, n);
+#endif
+  }
+
+  /// Generate float random number(s) with 'lognormal' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  /// \param m Mean of associated normal distribution
+  /// \param s Standard deviation of associated normal distribution.
+  inline void generate_lognormal(float *output, std::int64_t n, float m,
+                                 float s) {
+    generate<oneapi::mkl::rng::lognormal<float>>(output, n, m, s);
+  }
+
+  /// Generate double random number(s) with 'lognormal' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  /// \param m Mean of associated normal distribution
+  /// \param s Standard deviation of associated normal distribution.
+  inline void generate_lognormal(double *output, std::int64_t n, double m,
+                                 double s) {
+    generate<oneapi::mkl::rng::lognormal<double>>(output, n, m, s);
+  }
+
+  /// Generate float random number(s) with 'gaussian' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  /// \param mean Mean of normal distribution
+  /// \param stddev Standard deviation of normal distribution.
+  inline void generate_gaussian(float *output, std::int64_t n, float mean,
+                                float stddev) {
+    generate<oneapi::mkl::rng::gaussian<float>>(output, n, mean, stddev);
+  }
+
+  /// Generate double random number(s) with 'gaussian' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  /// \param mean Mean of normal distribution
+  /// \param stddev Standard deviation of normal distribution.
+  inline void generate_gaussian(double *output, std::int64_t n, double mean,
+                                double stddev) {
+    generate<oneapi::mkl::rng::gaussian<double>>(output, n, mean, stddev);
+  }
+
+  /// Generate unsigned int random number(s) with 'poisson' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  /// \param lambda Lambda for the Poisson distribution.
+  inline void generate_poisson(unsigned int *output, std::int64_t n,
+                               double lambda) {
+    generate<oneapi::mkl::rng::poisson<unsigned int>>(output, n, lambda);
+  }
+
+  /// Generate float random number(s) with 'uniform' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  inline void generate_uniform(float *output, std::int64_t n) {
+    generate<oneapi::mkl::rng::uniform<float>>(output, n);
+  }
+
+  /// Generate double random number(s) with 'uniform' distribution.
+  /// \param output The pointer of the first random number.
+  /// \param n The number of random numbers.
+  inline void generate_uniform(double *output, std::int64_t n) {
+    generate<oneapi::mkl::rng::uniform<double>>(output, n);
+  }
+
+  /// Skip ahead several random number(s).
+  /// \param num_to_skip The number of random numbers to be skipped.
+  void skip_ahead(const std::uint64_t num_to_skip) {
+#ifndef __INTEL_MKL__
+    oneapi::mkl::rng::skip_ahead(_engine, num_to_skip);
+#else
+    if constexpr (std::is_same_v<engine_t, oneapi::mkl::rng::mt2203>)
+      throw std::runtime_error("no skip_ahead method of mt2203 engine.");
+    else
+      oneapi::mkl::rng::skip_ahead(_engine, num_to_skip);
+#endif
+  }
+
+private:
+  static inline engine_t create_engine(sycl::queue *queue,
+                                       const std::uint64_t seed,
+                                       const std::uint32_t dimensions) {
+#ifdef __INTEL_MKL__
+    return std::is_same_v<engine_t, oneapi::mkl::rng::sobol>
+               ? engine_t(*queue, dimensions)
+               : engine_t(*queue, seed);
+#else
+    return engine_t(*queue, seed);
+#endif
+  }
+
+  template <typename distr_t, typename buffer_t, class... distr_params_t>
+  void generate(buffer_t *output, const std::int64_t n,
+                const distr_params_t... distr_params) {
+    auto output_buf = dpct::detail::get_memory<buffer_t>(output);
+    oneapi::mkl::rng::generate(distr_t(distr_params...), _engine, n,
+                               output_buf);
+  }
+  engine_t _engine{};
+};
+} // namespace detail
+} // namespace host
+
+enum class random_engine_type {
+  philox4x32x10,
+  mrg32k3a,
+  mt2203,
+  mt19937,
+  sobol,
+  mcg59
+};
+
+typedef std::shared_ptr<rng::host::detail::rng_generator_base> host_rng_ptr;
+
+/// Create a host random number generator.
+/// \param type The random engine type.
+/// \return The pointer of random number generator.
+inline host_rng_ptr create_host_rng(const random_engine_type type) {
+  switch (type) {
+  case random_engine_type::philox4x32x10:
+    return std::make_shared<
+        rng::host::detail::rng_generator<oneapi::mkl::rng::philox4x32x10>>();
+  case random_engine_type::mrg32k3a:
+    return std::make_shared<
+        rng::host::detail::rng_generator<oneapi::mkl::rng::mrg32k3a>>();
+#ifndef __INTEL_MKL__
+    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
+                             "Interfaces Project does not support this API.");
+#else
+  case random_engine_type::mt2203:
+    return std::make_shared<
+        rng::host::detail::rng_generator<oneapi::mkl::rng::mt2203>>();
+  case random_engine_type::mt19937:
+    return std::make_shared<
+        rng::host::detail::rng_generator<oneapi::mkl::rng::mt19937>>();
+  case random_engine_type::sobol:
+    return std::make_shared<
+        rng::host::detail::rng_generator<oneapi::mkl::rng::sobol>>();
+  case random_engine_type::mcg59:
+    return std::make_shared<
+        rng::host::detail::rng_generator<oneapi::mkl::rng::mcg59>>();
+#endif
+  }
+}
+} // namespace rng
+} // namespace dpct
+
+#endif // __DPCT_RNG_UTILS_HPP__
diff --git a/dpct/sparse_utils.hpp b/dpct/sparse_utils.hpp
new file mode 100644
index 0000000000000..b4f1ae4bcf330
--- /dev/null
+++ b/dpct/sparse_utils.hpp
@@ -0,0 +1,1385 @@
+//==---- sparse_utils.hpp -------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_SPARSE_UTILS_HPP__
+#define __DPCT_SPARSE_UTILS_HPP__
+
+#include "lib_common_utils.hpp"
+#include <oneapi/mkl.hpp>
+#include <sycl/sycl.hpp>
+
+namespace dpct {
+namespace sparse {
+/// Describes properties of a sparse matrix.
+/// The properties are matrix type, diag, uplo and index base.
+class matrix_info {
+public:
+  /// Matrix types are:
+  /// ge: General matrix
+  /// sy: Symmetric matrix
+  /// he: Hermitian matrix
+  /// tr: Triangular matrix
+  enum class matrix_type : int { ge = 0, sy, he, tr };
+
+  auto get_matrix_type() const { return _matrix_type; }
+  auto get_diag() const { return _diag; }
+  auto get_uplo() const { return _uplo; }
+  auto get_index_base() const { return _index_base; }
+  void set_matrix_type(matrix_type mt) { _matrix_type = mt; }
+  void set_diag(oneapi::mkl::diag d) { _diag = d; }
+  void set_uplo(oneapi::mkl::uplo u) { _uplo = u; }
+  void set_index_base(oneapi::mkl::index_base ib) { _index_base = ib; }
+
+private:
+  matrix_type _matrix_type = matrix_type::ge;
+  oneapi::mkl::diag _diag = oneapi::mkl::diag::nonunit;
+  oneapi::mkl::uplo _uplo = oneapi::mkl::uplo::upper;
+  oneapi::mkl::index_base _index_base = oneapi::mkl::index_base::zero;
+};
+
+enum class conversion_scope : int { index = 0, index_and_value };
+
+namespace detail {
+template <template <typename> typename functor_t, typename... args_t>
+inline void spblas_shim(library_data_t type, args_t &&...args) {
+  switch (type) {
+  case library_data_t::real_float: {
+    functor_t<float>()(std::forward<args_t>(args)...);
+    break;
+  }
+  case library_data_t::real_double: {
+    functor_t<double>()(std::forward<args_t>(args)...);
+    break;
+  }
+  case library_data_t::complex_float: {
+    functor_t<std::complex<float>>()(std::forward<args_t>(args)...);
+    break;
+  }
+  case library_data_t::complex_double: {
+    functor_t<std::complex<double>>()(std::forward<args_t>(args)...);
+    break;
+  }
+  default:
+    throw std::runtime_error("The data type is not supported.");
+  }
+}
+
+template <typename T> struct csrmv_impl {
+  void operator()(sycl::queue &queue, oneapi::mkl::transpose trans,
+                  int num_rows, int num_cols, const void *alpha,
+                  const std::shared_ptr<matrix_info> info, const void *val,
+                  const int *row_ptr, const int *col_ind, const void *x,
+                  const void *beta, void *y) {
+#ifndef __INTEL_MKL__
+    throw std::runtime_error(
+        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
+        "Project does not support this API.");
+#else
+    using Ty = typename dpct::DataType<T>::T2;
+    auto alpha_value =
+        dpct::detail::get_value(reinterpret_cast<const Ty *>(alpha), queue);
+    auto beta_value =
+        dpct::detail::get_value(reinterpret_cast<const Ty *>(beta), queue);
+
+    oneapi::mkl::sparse::matrix_handle_t *sparse_matrix_handle =
+        new oneapi::mkl::sparse::matrix_handle_t;
+    oneapi::mkl::sparse::init_matrix_handle(sparse_matrix_handle);
+    auto data_row_ptr = dpct::detail::get_memory<int>(row_ptr);
+    auto data_col_ind = dpct::detail::get_memory<int>(col_ind);
+    auto data_val = dpct::detail::get_memory<Ty>(val);
+    oneapi::mkl::sparse::set_csr_data(queue, *sparse_matrix_handle, num_rows,
+                                      num_cols, info->get_index_base(),
+                                      data_row_ptr, data_col_ind, data_val);
+
+    auto data_x = dpct::detail::get_memory<Ty>(x);
+    auto data_y = dpct::detail::get_memory<Ty>(y);
+    switch (info->get_matrix_type()) {
+    case matrix_info::matrix_type::ge: {
+      oneapi::mkl::sparse::optimize_gemv(queue, trans, *sparse_matrix_handle);
+      oneapi::mkl::sparse::gemv(queue, trans, alpha_value,
+                                *sparse_matrix_handle, data_x, beta_value,
+                                data_y);
+      break;
+    }
+    case matrix_info::matrix_type::sy: {
+      oneapi::mkl::sparse::symv(queue, info->get_uplo(), alpha_value,
+                                *sparse_matrix_handle, data_x, beta_value,
+                                data_y);
+      break;
+    }
+    case matrix_info::matrix_type::tr: {
+      oneapi::mkl::sparse::optimize_trmv(queue, info->get_uplo(), trans,
+                                         info->get_diag(),
+                                         *sparse_matrix_handle);
+      oneapi::mkl::sparse::trmv(
+          queue, info->get_uplo(), trans, info->get_diag(), alpha_value,
+          *sparse_matrix_handle, data_x, beta_value, data_y);
+      break;
+    }
+    default:
+      throw std::runtime_error(
+          "the spmv does not support matrix_info::matrix_type::he");
+    }
+
+    sycl::event e =
+        oneapi::mkl::sparse::release_matrix_handle(queue, sparse_matrix_handle);
+    queue.submit([&](sycl::handler &cgh) {
+      cgh.depends_on(e);
+      cgh.host_task([=] { delete sparse_matrix_handle; });
+    });
+#endif
+  }
+};
+} // namespace detail
+
+/// Computes a CSR format sparse matrix-dense vector product.
+/// y = alpha * op(A) * x + beta * y
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans The operation applied to the matrix A.
+/// \param [in] num_rows Number of rows of the matrix A.
+/// \param [in] num_cols Number of columns of the matrix A.
+/// \param [in] alpha Scaling factor for the matrix A.
+/// \param [in] info Matrix info of the matrix A.
+/// \param [in] val An array containing the non-zero elements of the matrix A.
+/// \param [in] row_ptr An array of length \p num_rows + 1.
+/// \param [in] col_ind An array containing the column indices in index-based
+/// numbering.
+/// \param [in] x Data of the vector x.
+/// \param [in] beta Scaling factor for the vector x.
+/// \param [in, out] y Data of the vector y.
+template <typename T>
+void csrmv(sycl::queue &queue, oneapi::mkl::transpose trans, int num_rows,
+           int num_cols, const T *alpha,
+           const std::shared_ptr<matrix_info> info, const T *val,
+           const int *row_ptr, const int *col_ind, const T *x, const T *beta,
+           T *y) {
+  detail::csrmv_impl<T>()(queue, trans, num_rows, num_cols, alpha, info, val,
+                          row_ptr, col_ind, x, beta, y);
+}
+
+/// Computes a CSR format sparse matrix-dense vector product.
+/// y = alpha * op(A) * x + beta * y
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans The operation applied to the matrix A.
+/// \param [in] num_rows Number of rows of the matrix A.
+/// \param [in] num_cols Number of columns of the matrix A.
+/// \param [in] alpha Scaling factor for the matrix A.
+/// \param [in] alpha_type Data type of \p alpha .
+/// \param [in] info Matrix info of the matrix A.
+/// \param [in] val An array containing the non-zero elements of the matrix A.
+/// \param [in] val_type Data type of \p val .
+/// \param [in] row_ptr An array of length \p num_rows + 1.
+/// \param [in] col_ind An array containing the column indices in index-based
+/// numbering.
+/// \param [in] x Data of the vector x.
+/// \param [in] x_type Data type of \p x .
+/// \param [in] beta Scaling factor for the vector x.
+/// \param [in] beta_type Data type of \p beta .
+/// \param [in, out] y Data of the vector y.
+/// \param [in] y_type Data type of \p y .
+inline void csrmv(sycl::queue &queue, oneapi::mkl::transpose trans,
+                  int num_rows, int num_cols, const void *alpha,
+                  library_data_t alpha_type,
+                  const std::shared_ptr<matrix_info> info, const void *val,
+                  library_data_t val_type, const int *row_ptr,
+                  const int *col_ind, const void *x, library_data_t x_type,
+                  const void *beta, library_data_t beta_type, void *y,
+                  library_data_t y_type) {
+  detail::spblas_shim<detail::csrmv_impl>(val_type, queue, trans, num_rows,
+                                          num_cols, alpha, info, val, row_ptr,
+                                          col_ind, x, beta, y);
+}
+
+/// Computes a CSR format sparse matrix-dense matrix product.
+/// C = alpha * op(A) * op(B) + beta * C
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans_a The operation applied to the matrix A.
+/// \param [in] trans_b The operation applied to the matrix B.
+/// \param [in] sparse_rows Number of rows of the matrix A.
+/// \param [in] dense_cols Number of columns of the matrix B or C.
+/// \param [in] sparse_cols Number of columns of the matrix A.
+/// \param [in] alpha Scaling factor for the matrix A.
+/// \param [in] info Matrix info of the matrix A.
+/// \param [in] val An array containing the non-zero elements of the matrix A.
+/// \param [in] row_ptr An array of length \p num_rows + 1.
+/// \param [in] col_ind An array containing the column indices in index-based
+/// numbering.
+/// \param [in] b Data of the matrix B.
+/// \param [in] ldb Leading dimension of the matrix B.
+/// \param [in] beta Scaling factor for the matrix B.
+/// \param [in, out] c Data of the matrix C.
+/// \param [in] ldc Leading dimension of the matrix C.
+template <typename T>
+void csrmm(sycl::queue &queue, oneapi::mkl::transpose trans_a,
+           oneapi::mkl::transpose trans_b, int sparse_rows, int dense_cols,
+           int sparse_cols, const T *alpha,
+           const std::shared_ptr<matrix_info> info, const T *val,
+           const int *row_ptr, const int *col_ind, const T *b, int ldb,
+           const T *beta, T *c, int ldc) {
+#ifndef __INTEL_MKL__
+  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                           "Project does not support this API.");
+#else
+  using Ty = typename dpct::DataType<T>::T2;
+  auto alpha_value =
+      dpct::detail::get_value(reinterpret_cast<const Ty *>(alpha), queue);
+  auto beta_value =
+      dpct::detail::get_value(reinterpret_cast<const Ty *>(beta), queue);
+
+  oneapi::mkl::sparse::matrix_handle_t *sparse_matrix_handle =
+      new oneapi::mkl::sparse::matrix_handle_t;
+  oneapi::mkl::sparse::init_matrix_handle(sparse_matrix_handle);
+  auto data_row_ptr = dpct::detail::get_memory<int>(row_ptr);
+  auto data_col_ind = dpct::detail::get_memory<int>(col_ind);
+  auto data_val = dpct::detail::get_memory<Ty>(val);
+  oneapi::mkl::sparse::set_csr_data(queue, *sparse_matrix_handle, sparse_rows,
+                                    sparse_cols, info->get_index_base(),
+                                    data_row_ptr, data_col_ind, data_val);
+
+  auto data_b = dpct::detail::get_memory<Ty>(b);
+  auto data_c = dpct::detail::get_memory<Ty>(c);
+  sycl::event gemm_event;
+  switch (info->get_matrix_type()) {
+  case matrix_info::matrix_type::ge: {
+#ifndef DPCT_USM_LEVEL_NONE
+    gemm_event =
+#endif
+        oneapi::mkl::sparse::gemm(queue, oneapi::mkl::layout::col_major,
+                                  trans_a, trans_b, alpha_value,
+                                  *sparse_matrix_handle, data_b, dense_cols,
+                                  ldb, beta_value, data_c, ldc);
+    break;
+  }
+  default:
+    throw std::runtime_error(
+        "the csrmm does not support matrix_info::matrix_type::sy, "
+        "matrix_info::matrix_type::tr and matrix_info::matrix_type::he");
+  }
+#ifdef DPCT_USM_LEVEL_NONE
+  queue.wait();
+#endif
+  sycl::event e = oneapi::mkl::sparse::release_matrix_handle(
+      queue, sparse_matrix_handle, {gemm_event});
+  queue.submit([&](sycl::handler &cgh) {
+    cgh.depends_on(e);
+    cgh.host_task([=] { delete sparse_matrix_handle; });
+  });
+#endif
+}
+
+/// Computes a CSR format sparse matrix-dense matrix product.
+/// C = alpha * op(A) * B + beta * C
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans The operation applied to the matrix A.
+/// \param [in] sparse_rows Number of rows of the matrix A.
+/// \param [in] dense_cols Number of columns of the matrix op(B) or C.
+/// \param [in] sparse_cols Number of columns of the matrix A.
+/// \param [in] alpha Scaling factor for the matrix A.
+/// \param [in] info Matrix info of the matrix A.
+/// \param [in] val An array containing the non-zero elements of the matrix A.
+/// \param [in] row_ptr An array of length \p num_rows + 1.
+/// \param [in] col_ind An array containing the column indices in index-based
+/// numbering.
+/// \param [in] b Data of the matrix B.
+/// \param [in] ldb Leading dimension of the matrix B.
+/// \param [in] beta Scaling factor for the matrix B.
+/// \param [in, out] c Data of the matrix C.
+/// \param [in] ldc Leading dimension of the matrix C.
+template <typename T>
+void csrmm(sycl::queue &queue, oneapi::mkl::transpose trans, int sparse_rows,
+           int dense_cols, int sparse_cols, const T *alpha,
+           const std::shared_ptr<matrix_info> info, const T *val,
+           const int *row_ptr, const int *col_ind, const T *b, int ldb,
+           const T *beta, T *c, int ldc) {
+  csrmm<T>(queue, trans, oneapi::mkl::transpose::nontrans, sparse_rows,
+           dense_cols, sparse_cols, alpha, info, val, row_ptr, col_ind, b, ldb,
+           beta, c, ldc);
+}
+
+#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
+/// Saving the optimization information for solving a system of linear
+/// equations.
+class optimize_info {
+public:
+  /// Constructor
+  optimize_info() { oneapi::mkl::sparse::init_matrix_handle(&_matrix_handle); }
+  /// Destructor
+  ~optimize_info() {
+    oneapi::mkl::sparse::release_matrix_handle(get_default_queue(),
+                                               &_matrix_handle, _deps)
+        .wait();
+  }
+  /// Add dependency for the destructor.
+  /// \param [in] e The event which the destructor depends on.
+  void add_dependency(sycl::event e) { _deps.push_back(e); }
+  /// Get the internal saved matrix handle.
+  /// \return Returns the matrix handle.
+  oneapi::mkl::sparse::matrix_handle_t get_matrix_handle() const noexcept {
+    return _matrix_handle;
+  }
+
+private:
+  oneapi::mkl::sparse::matrix_handle_t _matrix_handle = nullptr;
+  std::vector<sycl::event> _deps;
+};
+#endif
+
+#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
+namespace detail {
+#ifdef DPCT_USM_LEVEL_NONE
+#define SPARSE_CALL(CALL, HANDLE) CALL;
+#else
+#define SPARSE_CALL(CALL, HANDLE)                                              \
+  sycl::event e = CALL;                                                        \
+  HANDLE->add_dependency(e);
+#endif
+
+template <typename T> struct optimize_csrsv_impl {
+  void operator()(sycl::queue &queue, oneapi::mkl::transpose trans, int row_col,
+                  const std::shared_ptr<matrix_info> info, const void *val,
+                  const int *row_ptr, const int *col_ind,
+                  std::shared_ptr<optimize_info> optimize_info) {
+    using Ty = typename dpct::DataType<T>::T2;
+    auto data_row_ptr = dpct::detail::get_memory<int>(row_ptr);
+    auto data_col_ind = dpct::detail::get_memory<int>(col_ind);
+    auto data_val = dpct::detail::get_memory<Ty>(val);
+    oneapi::mkl::sparse::set_csr_data(queue, optimize_info->get_matrix_handle(),
+                                      row_col, row_col, info->get_index_base(),
+                                      data_row_ptr, data_col_ind, data_val);
+    if (info->get_matrix_type() != matrix_info::matrix_type::tr)
+      throw std::runtime_error("dpct::sparse::optimize_csrsv_impl()(): "
+                               "oneapi::mkl::sparse::optimize_trsv "
+                               "only accept triangular matrix.");
+    SPARSE_CALL(oneapi::mkl::sparse::optimize_trsv(
+                    queue, info->get_uplo(), trans, info->get_diag(),
+                    optimize_info->get_matrix_handle()),
+                optimize_info);
+  }
+};
+template <typename T> struct csrsv_impl {
+  void operator()(sycl::queue &queue, oneapi::mkl::transpose trans, int row_col,
+                  const void *alpha, const std::shared_ptr<matrix_info> info,
+                  const void *val, const int *row_ptr, const int *col_ind,
+                  std::shared_ptr<optimize_info> optimize_info, const void *x,
+                  void *y) {
+    using Ty = typename dpct::DataType<T>::T2;
+    auto alpha_value =
+        dpct::detail::get_value(static_cast<const Ty *>(alpha), queue);
+    Ty *new_x_ptr = nullptr;
+    if (alpha_value != Ty(1.0f)) {
+      new_x_ptr = (Ty *)dpct::dpct_malloc(row_col * sizeof(Ty));
+      dpct::detail::dpct_memcpy(queue, new_x_ptr, x, row_col * sizeof(Ty),
+                                dpct::memcpy_direction::automatic);
+      auto data_new_x = dpct::detail::get_memory<Ty>(new_x_ptr);
+      oneapi::mkl::blas::column_major::scal(queue, row_col, alpha_value,
+                                            data_new_x, 1);
+    } else {
+      new_x_ptr = const_cast<Ty *>(static_cast<const Ty *>(x));
+    }
+    auto data_new_x = dpct::detail::get_memory<Ty>(new_x_ptr);
+    auto data_y = dpct::detail::get_memory<Ty>(y);
+
+    SPARSE_CALL(oneapi::mkl::sparse::trsv(
+                    queue, info->get_uplo(), trans, info->get_diag(),
+                    optimize_info->get_matrix_handle(), data_new_x, data_y),
+                optimize_info);
+    if (alpha_value != Ty(1.0f)) {
+      dpct::async_dpct_free({new_x_ptr},
+                            {
+#ifndef DPCT_USM_LEVEL_NONE
+                                e
+#endif
+                            },
+                            queue);
+    }
+  }
+};
+} // namespace detail
+
+/// Performs internal optimizations for solving a system of linear equations for
+/// a CSR format sparse matrix.
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans The operation applied to the sparse matrix.
+/// \param [in] row_col Number of rows of the sparse matrix.
+/// \param [in] info Matrix info of the sparse matrix.
+/// \param [in] val An array containing the non-zero elements of the sparse matrix.
+/// \param [in] row_ptr An array of length \p num_rows + 1.
+/// \param [in] col_ind An array containing the column indices in index-based
+/// numbering.
+/// \param [out] optimize_info The result of the optimizations.
+template <typename T>
+void optimize_csrsv(sycl::queue &queue, oneapi::mkl::transpose trans,
+                    int row_col, const std::shared_ptr<matrix_info> info,
+                    const T *val, const int *row_ptr, const int *col_ind,
+                    std::shared_ptr<optimize_info> optimize_info) {
+  detail::optimize_csrsv_impl<T>()(queue, trans, row_col, info, val, row_ptr,
+                                   col_ind, optimize_info);
+}
+
+inline void optimize_csrsv(sycl::queue &queue, oneapi::mkl::transpose trans,
+                           int row_col, const std::shared_ptr<matrix_info> info,
+                           const void *val, library_data_t val_type,
+                           const int *row_ptr, const int *col_ind,
+                           std::shared_ptr<optimize_info> optimize_info) {
+  detail::spblas_shim<detail::optimize_csrsv_impl>(
+      val_type, queue, trans, row_col, info, val, row_ptr, col_ind,
+      optimize_info);
+}
+
+template <typename T>
+void csrsv(sycl::queue &queue, oneapi::mkl::transpose trans, int row_col,
+           const T *alpha, const std::shared_ptr<matrix_info> info,
+           const T *val, const int *row_ptr, const int *col_ind,
+           std::shared_ptr<optimize_info> optimize_info, const T *x, T *y) {
+  detail::csrsv_impl<T>()(queue, trans, row_col, alpha, info, val, row_ptr,
+                          col_ind, optimize_info, x, y);
+}
+
+inline void csrsv(sycl::queue &queue, oneapi::mkl::transpose trans, int row_col,
+                  const void *alpha, library_data_t alpha_type,
+                  const std::shared_ptr<matrix_info> info, const void *val,
+                  library_data_t val_type, const int *row_ptr,
+                  const int *col_ind,
+                  std::shared_ptr<optimize_info> optimize_info, const void *x,
+                  library_data_t x_type, void *y, library_data_t y_type) {
+  detail::spblas_shim<detail::csrsv_impl>(val_type, queue, trans, row_col,
+                                          alpha, info, val, row_ptr, col_ind,
+                                          optimize_info, x, y);
+}
+#endif
+
+class sparse_matrix_desc;
+
+using sparse_matrix_desc_t = std::shared_ptr<sparse_matrix_desc>;
+
+/// Structure for describe a dense vector
+class dense_vector_desc {
+public:
+  dense_vector_desc(std::int64_t ele_num, void *value,
+                    library_data_t value_type)
+      : _ele_num(ele_num), _value(value), _value_type(value_type) {}
+  void get_desc(std::int64_t *ele_num, const void **value,
+                library_data_t *value_type) const noexcept {
+    *ele_num = _ele_num;
+    *value = _value;
+    *value_type = _value_type;
+  }
+  void get_desc(std::int64_t *ele_num, void **value,
+                library_data_t *value_type) const noexcept {
+    get_desc(ele_num, const_cast<const void **>(value), value_type);
+  }
+  void *get_value() const noexcept { return _value; }
+  void set_value(void *value) { _value = value; }
+  library_data_t get_value_type() const noexcept { return _value_type; }
+  std::int64_t get_ele_num() const noexcept { return _ele_num; }
+
+private:
+  std::int64_t _ele_num;
+  void *_value;
+  library_data_t _value_type;
+};
+
+/// Structure for describe a dense matrix
+class dense_matrix_desc {
+public:
+  dense_matrix_desc(std::int64_t row_num, std::int64_t col_num,
+                    std::int64_t leading_dim, void *value,
+                    library_data_t value_type, oneapi::mkl::layout layout)
+      : _row_num(row_num), _col_num(col_num), _leading_dim(leading_dim),
+        _value(value), _value_type(value_type), _layout(layout) {}
+  void get_desc(std::int64_t *row_num, std::int64_t *col_num,
+                std::int64_t *leading_dim, void **value,
+                library_data_t *value_type,
+                oneapi::mkl::layout *layout) const noexcept {
+    *row_num = _row_num;
+    *col_num = _col_num;
+    *leading_dim = _leading_dim;
+    *value = _value;
+    *value_type = _value_type;
+    *layout = _layout;
+  }
+  void *get_value() const noexcept { return _value; }
+  void set_value(void *value) { _value = value; }
+  std::int64_t get_col_num() const noexcept { return _col_num; }
+  std::int64_t get_leading_dim() const noexcept { return _leading_dim; }
+  oneapi::mkl::layout get_layout() const noexcept { return _layout; }
+
+private:
+  std::int64_t _row_num;
+  std::int64_t _col_num;
+  std::int64_t _leading_dim;
+  void *_value;
+  library_data_t _value_type;
+  oneapi::mkl::layout _layout;
+};
+
+/// Sparse matrix data format
+enum matrix_format : int {
+  csr = 1,
+};
+
+/// Sparse matrix attribute
+enum matrix_attribute : int { uplo = 0, diag };
+
+#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
+/// Structure for describe a sparse matrix
+class sparse_matrix_desc {
+public:
+  /// Constructor
+  /// \param [out] desc The descriptor to be created
+  /// \param [in] row_num Number of rows of the sparse matrix.
+  /// \param [in] col_num Number of colums of the sparse matrix.
+  /// \param [in] nnz Non-zero elements in the sparse matrix.
+  /// \param [in] row_ptr An array of length \p row_num + 1. If the \p row_ptr is
+  /// NULL, the sparse_matrix_desc will allocate internal memory for it. This
+  /// internal memory can be gotten from get_shadow_row_ptr().
+  /// \param [in] col_ind An array containing the column indices in index-based
+  /// numbering.
+  /// \param [in] value An array containing the non-zero elements of the sparse matrix.
+  /// \param [in] row_ptr_type Data type of the \p row_ptr .
+  /// \param [in] col_ind_type Data type of the \p col_ind .
+  /// \param [in] base Indicates how input arrays are indexed.
+  /// \param [in] value_type Data type of the \p value .
+  /// \param [in] data_format The matrix data format.
+  sparse_matrix_desc(std::int64_t row_num, std::int64_t col_num,
+                     std::int64_t nnz, void *row_ptr, void *col_ind,
+                     void *value, library_data_t row_ptr_type,
+                     library_data_t col_ind_type, oneapi::mkl::index_base base,
+                     library_data_t value_type, matrix_format data_format)
+      : _row_num(row_num), _col_num(col_num), _nnz(nnz), _row_ptr(row_ptr),
+        _col_ind(col_ind), _value(value), _row_ptr_type(row_ptr_type),
+        _col_ind_type(col_ind_type), _base(base), _value_type(value_type),
+        _data_format(data_format) {
+    if (_data_format != matrix_format::csr) {
+      throw std::runtime_error("the sparse matrix data format is unsupported");
+    }
+    oneapi::mkl::sparse::init_matrix_handle(&_matrix_handle);
+    set_data();
+  }
+  /// Destructor
+  ~sparse_matrix_desc() {
+    oneapi::mkl::sparse::release_matrix_handle(get_default_queue(),
+                                               &_matrix_handle, _deps)
+        .wait();
+  }
+
+  /// Add dependency for the destroy method.
+  /// \param [in] e The event which the destroy method depends on.
+  void add_dependency(sycl::event e) { _deps.push_back(e); }
+  /// Get the internal saved matrix handle.
+  /// \return Returns the matrix handle.
+  oneapi::mkl::sparse::matrix_handle_t get_matrix_handle() const noexcept {
+    return _matrix_handle;
+  }
+  /// Get the values saved in the descriptor
+  /// \param [out] row_num Number of rows of the sparse matrix.
+  /// \param [out] col_num Number of colums of the sparse matrix.
+  /// \param [out] nnz Non-zero elements in the sparse matrix.
+  /// \param [out] row_ptr An array of length \p row_num + 1.
+  /// \param [out] col_ind An array containing the column indices in index-based
+  /// numbering.
+  /// \param [out] value An array containing the non-zero elements of the sparse matrix.
+  /// \param [out] row_ptr_type Data type of the \p row_ptr .
+  /// \param [out] col_ind_type Data type of the \p col_ind .
+  /// \param [out] base Indicates how input arrays are indexed.
+  /// \param [out] value_type Data type of the \p value .
+  void get_desc(int64_t *row_num, int64_t *col_num, int64_t *nnz,
+                void **row_ptr, void **col_ind, void **value,
+                library_data_t *row_ptr_type, library_data_t *col_ind_type,
+                oneapi::mkl::index_base *base,
+                library_data_t *value_type) const noexcept {
+    *row_num = _row_num;
+    *col_num = _col_num;
+    *nnz = _nnz;
+    *row_ptr = _row_ptr;
+    *col_ind = _col_ind;
+    *value = _value;
+    *row_ptr_type = _row_ptr_type;
+    *col_ind_type = _col_ind_type;
+    *base = _base;
+    *value_type = _value_type;
+  }
+  /// Get the sparse matrix data format of this descriptor
+  /// \param [out] format The matrix data format result
+  void get_format(matrix_format *data_format) const noexcept {
+    *data_format = _data_format;
+  }
+  /// Get the index base of this descriptor
+  /// \param [out] base The index base result
+  void get_base(oneapi::mkl::index_base *base) const noexcept { *base = _base; }
+  /// Get the value pointer of this descriptor
+  /// \param [out] value The value pointer result
+  void get_value(void **value) const noexcept { *value = _value; }
+  /// Set the value pointer of this descriptor
+  /// \param [in] value The input value pointer
+  void set_value(void *value) {
+    if (!value) {
+      throw std::runtime_error(
+          "dpct::sparse::sparse_matrix_desc::set_value(): The value "
+          "pointer is NULL.");
+    }
+    if (_value && (_value != value)) {
+      throw std::runtime_error(
+          "dpct::sparse::sparse_matrix_desc::set_value(): "
+          "The _value pointer is not NULL. It cannot be reset.");
+    }
+    _value = value;
+    set_data();
+  }
+  /// Get the size of the sparse matrix
+  /// \param [out] row_num Number of rows of the sparse matrix.
+  /// \param [out] col_num Number of colums of the sparse matrix.
+  /// \param [out] nnz Non-zero elements in the sparse matrix.
+  void get_size(int64_t *row_num, int64_t *col_num,
+                int64_t *nnz) const noexcept {
+    *row_num = _row_num;
+    *col_num = _col_num;
+    *nnz = _nnz;
+  }
+  /// Set the sparse matrix attribute
+  /// \param [in] attribute The attribute type
+  /// \param [in] data The attribute value
+  /// \param [in] data_size The data size of the attribute value
+  void set_attribute(matrix_attribute attribute, const void *data,
+                     size_t data_size) {
+    if (attribute == matrix_attribute::diag) {
+      const oneapi::mkl::diag *diag_ptr =
+          reinterpret_cast<const oneapi::mkl::diag *>(data);
+      if (*diag_ptr == oneapi::mkl::diag::unit) {
+        _diag = oneapi::mkl::diag::unit;
+      } else if (*diag_ptr == oneapi::mkl::diag::nonunit) {
+        _diag = oneapi::mkl::diag::nonunit;
+      } else {
+        throw std::runtime_error("unsupported diag value");
+      }
+    } else if (attribute == matrix_attribute::uplo) {
+      const oneapi::mkl::uplo *uplo_ptr =
+          reinterpret_cast<const oneapi::mkl::uplo *>(data);
+      if (*uplo_ptr == oneapi::mkl::uplo::upper) {
+        _uplo = oneapi::mkl::uplo::upper;
+      } else if (*uplo_ptr == oneapi::mkl::uplo::lower) {
+        _uplo = oneapi::mkl::uplo::lower;
+      } else {
+        throw std::runtime_error("unsupported uplo value");
+      }
+    } else {
+      throw std::runtime_error("unsupported attribute");
+    }
+  }
+  /// Get the sparse matrix attribute
+  /// \param [out] attribute The attribute type
+  /// \param [out] data The attribute value
+  /// \param [out] data_size The data size of the attribute value
+  void get_attribute(matrix_attribute attribute, void *data,
+                     size_t data_size) const {
+    if (attribute == matrix_attribute::diag) {
+      oneapi::mkl::diag *diag_ptr = reinterpret_cast<oneapi::mkl::diag *>(data);
+      if (_diag.has_value()) {
+        *diag_ptr = _diag.value();
+      } else {
+        *diag_ptr = oneapi::mkl::diag::nonunit;
+      }
+    } else if (attribute == matrix_attribute::uplo) {
+      oneapi::mkl::uplo *uplo_ptr = reinterpret_cast<oneapi::mkl::uplo *>(data);
+      if (_uplo.has_value()) {
+        *uplo_ptr = _uplo.value();
+      } else {
+        *uplo_ptr = oneapi::mkl::uplo::lower;
+      }
+    } else {
+      throw std::runtime_error("unsupported attribute");
+    }
+  }
+  /// Set the pointers for describing the sparse matrix
+  /// \param [in] row_ptr An array of length \p row_num + 1.
+  /// \param [in] col_ind An array containing the column indices in index-based
+  /// numbering.
+  /// \param [in] value An array containing the non-zero elements of the sparse matrix.
+  void set_pointers(void *row_ptr, void *col_ind, void *value) {
+    if (!row_ptr) {
+      throw std::runtime_error(
+          "dpct::sparse::sparse_matrix_desc::set_pointers(): The "
+          "row_ptr pointer is NULL.");
+    }
+    if (!col_ind) {
+      throw std::runtime_error(
+          "dpct::sparse::sparse_matrix_desc::set_pointers(): The "
+          "col_ind pointer is NULL.");
+    }
+    if (_row_ptr && (_row_ptr != row_ptr)) {
+      throw std::runtime_error("dpct::sparse::sparse_matrix_desc::set_pointers("
+                               "): The _row_ptr pointer is "
+                               "not NULL. It cannot be reset.");
+    }
+    if (_col_ind && (_col_ind != col_ind)) {
+      throw std::runtime_error("dpct::sparse::sparse_matrix_desc::set_pointers("
+                               "): The _col_ind pointer is "
+                               "not NULL. It cannot be reset.");
+    }
+    _row_ptr = row_ptr;
+    _col_ind = col_ind;
+
+    // The descriptor will be updated in the set_value function
+    set_value(value);
+  }
+
+  /// Get the diag attribute
+  /// \return diag value
+  std::optional<oneapi::mkl::diag> get_diag() const noexcept { return _diag; }
+  /// Get the uplo attribute
+  /// \return uplo value
+  std::optional<oneapi::mkl::uplo> get_uplo() const noexcept { return _uplo; }
+  /// Set the number of non-zero elements
+  /// \param nnz [in] The number of non-zero elements.
+  void set_nnz(std::int64_t nnz) noexcept { _nnz = nnz; }
+  /// Get the type of the value pointer.
+  /// \return The type of the value pointer.
+  library_data_t get_value_type() const noexcept { return _value_type; }
+  /// Get the row_ptr.
+  /// \return The row_ptr.
+  void *get_row_ptr() const noexcept { return _row_ptr; }
+  /// If the internal _row_ptr is NULL, the sparse_matrix_desc will allocate
+  /// internal memory for it in the constructor. The internal memory can be gotten
+  /// from this interface.
+  /// \return The shadow row_ptr.
+  void *get_shadow_row_ptr() const noexcept { return _shadow_row_ptr.get(); }
+  /// Get the type of the col_ind pointer.
+  /// \return The type of the col_ind pointer.
+  library_data_t get_col_ind_type() const noexcept { return _col_ind_type; }
+  /// Get the row_num.
+  /// \return The row_num.
+  std::int64_t get_row_num() const noexcept { return _row_num; }
+
+private:
+  inline static const std::function<void(void *)> _shadow_row_ptr_deleter =
+      [](void *ptr) { dpct::dpct_free(ptr); };
+  template <typename index_t, typename value_t> void set_data() {
+    void *row_ptr = nullptr;
+    if (_shadow_row_ptr) {
+      row_ptr = _shadow_row_ptr.get();
+    } else if (_row_ptr) {
+      row_ptr = _row_ptr;
+    } else {
+      row_ptr = dpct::dpct_malloc(sizeof(index_t) * (_row_num + 1),
+                                  get_default_queue());
+      _shadow_row_ptr.reset(row_ptr);
+    }
+#ifdef DPCT_USM_LEVEL_NONE
+    using data_index_t = sycl::buffer<index_t>;
+    using data_value_t = sycl::buffer<value_t>;
+#else
+    using data_index_t = index_t *;
+    using data_value_t = value_t *;
+#endif
+    _data_row_ptr = dpct::detail::get_memory<index_t>(row_ptr);
+    _data_col_ind = dpct::detail::get_memory<index_t>(_col_ind);
+    _data_value = dpct::detail::get_memory<value_t>(_value);
+    oneapi::mkl::sparse::set_csr_data(get_default_queue(), _matrix_handle,
+                                      _row_num, _col_num, _base,
+                                      std::get<data_index_t>(_data_row_ptr),
+                                      std::get<data_index_t>(_data_col_ind),
+                                      std::get<data_value_t>(_data_value));
+    get_default_queue().wait();
+  }
+
+  void set_data() {
+    std::uint64_t key = dpct::detail::get_type_combination_id(
+        _row_ptr_type, _col_ind_type, _value_type);
+    switch (key) {
+    case dpct::detail::get_type_combination_id(library_data_t::real_int32,
+                                               library_data_t::real_int32,
+                                               library_data_t::real_float): {
+      set_data<std::int32_t, float>();
+      break;
+    }
+    case dpct::detail::get_type_combination_id(library_data_t::real_int32,
+                                               library_data_t::real_int32,
+                                               library_data_t::real_double): {
+      set_data<std::int32_t, double>();
+      break;
+    }
+    case dpct::detail::get_type_combination_id(library_data_t::real_int32,
+                                               library_data_t::real_int32,
+                                               library_data_t::complex_float): {
+      set_data<std::int32_t, std::complex<float>>();
+      break;
+    }
+    case dpct::detail::get_type_combination_id(
+        library_data_t::real_int32, library_data_t::real_int32,
+        library_data_t::complex_double): {
+      set_data<std::int32_t, std::complex<double>>();
+      break;
+    }
+    case dpct::detail::get_type_combination_id(library_data_t::real_int64,
+                                               library_data_t::real_int64,
+                                               library_data_t::real_float): {
+      set_data<std::int64_t, float>();
+      break;
+    }
+    case dpct::detail::get_type_combination_id(library_data_t::real_int64,
+                                               library_data_t::real_int64,
+                                               library_data_t::real_double): {
+      set_data<std::int64_t, double>();
+      break;
+    }
+    case dpct::detail::get_type_combination_id(library_data_t::real_int64,
+                                               library_data_t::real_int64,
+                                               library_data_t::complex_float): {
+      set_data<std::int64_t, std::complex<float>>();
+      break;
+    }
+    case dpct::detail::get_type_combination_id(
+        library_data_t::real_int64, library_data_t::real_int64,
+        library_data_t::complex_double): {
+      set_data<std::int64_t, std::complex<double>>();
+      break;
+    }
+    default:
+      throw std::runtime_error("the combination of data type is unsupported");
+    }
+  }
+
+  std::int64_t _row_num;
+  std::int64_t _col_num;
+  std::int64_t _nnz;
+  void *_row_ptr;
+  void *_col_ind;
+  void *_value;
+  library_data_t _row_ptr_type;
+  library_data_t _col_ind_type;
+  oneapi::mkl::index_base _base;
+  library_data_t _value_type;
+  oneapi::mkl::sparse::matrix_handle_t _matrix_handle = nullptr;
+  std::vector<sycl::event> _deps;
+  matrix_format _data_format;
+  std::optional<oneapi::mkl::uplo> _uplo;
+  std::optional<oneapi::mkl::diag> _diag;
+  std::unique_ptr<void, std::function<void(void *)>> _shadow_row_ptr =
+      std::unique_ptr<void, std::function<void(void *)>>(
+          nullptr, _shadow_row_ptr_deleter);
+
+  static constexpr size_t _max_data_variable_size = std::max(
+      {sizeof(sycl::buffer<std::int32_t>), sizeof(sycl::buffer<std::int64_t>),
+       sizeof(sycl::buffer<float>), sizeof(sycl::buffer<double>),
+       sizeof(sycl::buffer<std::complex<float>>),
+       sizeof(sycl::buffer<std::complex<double>>), sizeof(void *)});
+  using index_variant_t =
+      std::variant<std::array<std::byte, _max_data_variable_size>,
+                   sycl::buffer<std::int32_t>, sycl::buffer<std::int64_t>,
+                   std::int32_t *, std::int64_t *>;
+  using value_variant_t =
+      std::variant<std::array<std::byte, _max_data_variable_size>,
+                   sycl::buffer<float>, sycl::buffer<double>,
+                   sycl::buffer<std::complex<float>>,
+                   sycl::buffer<std::complex<double>>, float *, double *,
+                   std::complex<float> *, std::complex<double> *>;
+  index_variant_t _data_row_ptr;
+  index_variant_t _data_col_ind;
+  value_variant_t _data_value;
+};
+
+namespace detail {
+template <typename T> struct spmv_impl {
+  void operator()(sycl::queue queue, oneapi::mkl::transpose trans,
+                  const void *alpha, sparse_matrix_desc_t a,
+                  std::shared_ptr<dense_vector_desc> x, const void *beta,
+                  std::shared_ptr<dense_vector_desc> y) {
+    auto alpha_value =
+        dpct::detail::get_value(reinterpret_cast<const T *>(alpha), queue);
+    auto beta_value =
+        dpct::detail::get_value(reinterpret_cast<const T *>(beta), queue);
+    auto data_x = dpct::detail::get_memory<T>(x->get_value());
+    auto data_y = dpct::detail::get_memory<T>(y->get_value());
+    if (a->get_diag().has_value() && a->get_uplo().has_value()) {
+      oneapi::mkl::sparse::optimize_trmv(queue, a->get_uplo().value(), trans,
+                                         a->get_diag().value(),
+                                         a->get_matrix_handle());
+      SPARSE_CALL(oneapi::mkl::sparse::trmv(queue, a->get_uplo().value(), trans,
+                                            a->get_diag().value(), alpha_value,
+                                            a->get_matrix_handle(), data_x,
+                                            beta_value, data_y),
+                  a);
+    } else {
+      oneapi::mkl::sparse::optimize_gemv(queue, trans, a->get_matrix_handle());
+      SPARSE_CALL(oneapi::mkl::sparse::gemv(queue, trans, alpha_value,
+                                            a->get_matrix_handle(), data_x,
+                                            beta_value, data_y),
+                  a);
+    }
+  }
+};
+
+template <typename T> struct spmm_impl {
+  void operator()(sycl::queue queue, oneapi::mkl::transpose trans_a,
+                  oneapi::mkl::transpose trans_b, const void *alpha,
+                  sparse_matrix_desc_t a, std::shared_ptr<dense_matrix_desc> b,
+                  const void *beta, std::shared_ptr<dense_matrix_desc> c) {
+    auto alpha_value =
+        dpct::detail::get_value(reinterpret_cast<const T *>(alpha), queue);
+    auto beta_value =
+        dpct::detail::get_value(reinterpret_cast<const T *>(beta), queue);
+    auto data_b = dpct::detail::get_memory<T>(b->get_value());
+    auto data_c = dpct::detail::get_memory<T>(c->get_value());
+    SPARSE_CALL(
+        oneapi::mkl::sparse::gemm(queue, b->get_layout(), trans_a, trans_b,
+                                  alpha_value, a->get_matrix_handle(), data_b,
+                                  b->get_col_num(), b->get_leading_dim(),
+                                  beta_value, data_c, c->get_leading_dim()),
+        a);
+  }
+};
+#undef SPARSE_CALL
+} // namespace detail
+
+/// Computes a sparse matrix-dense vector product: y = alpha * op(a) * x + beta * y.
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans Specifies operation on input matrix.
+/// \param [in] alpha Specifies the scalar alpha.
+/// \param [in] a Specifies the sparse matrix a.
+/// \param [in] x Specifies the dense vector x.
+/// \param [in] beta Specifies the scalar beta.
+/// \param [in, out] y Specifies the dense vector y.
+/// \param [in] data_type Specifies the data type of \param a, \param x and \param y .
+inline void spmv(sycl::queue queue, oneapi::mkl::transpose trans,
+                 const void *alpha, sparse_matrix_desc_t a,
+                 std::shared_ptr<dense_vector_desc> x, const void *beta,
+                 std::shared_ptr<dense_vector_desc> y,
+                 library_data_t data_type) {
+  detail::spblas_shim<detail::spmv_impl>(data_type, queue, trans, alpha, a, x,
+                                         beta, y);
+}
+
+/// Computes a sparse matrix-dense matrix product: c = alpha * op(a) * op(b) + beta * c.
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans_a Specifies operation on input matrix a.
+/// \param [in] trans_b Specifies operation on input matrix b.
+/// \param [in] alpha Specifies the scalar alpha.
+/// \param [in] a Specifies the sparse matrix a.
+/// \param [in] b Specifies the dense matrix b.
+/// \param [in] beta Specifies the scalar beta.
+/// \param [in, out] c Specifies the dense matrix c.
+/// \param [in] data_type Specifies the data type of \param a, \param b and \param c .
+inline void spmm(sycl::queue queue, oneapi::mkl::transpose trans_a,
+                 oneapi::mkl::transpose trans_b, const void *alpha,
+                 sparse_matrix_desc_t a, std::shared_ptr<dense_matrix_desc> b,
+                 const void *beta, std::shared_ptr<dense_matrix_desc> c,
+                 library_data_t data_type) {
+  if (b->get_layout() != c->get_layout())
+    throw std::runtime_error("the layout of b and c are different");
+  detail::spblas_shim<detail::spmm_impl>(data_type, queue, trans_a, trans_b,
+                                         alpha, a, b, beta, c);
+}
+
+namespace detail {
+template <typename T, bool is_host_memory, typename host_memory_t = void>
+struct temp_memory {
+  static_assert(!is_host_memory || !std::is_same_v<host_memory_t, void>,
+                "host_memory_t cannot be void when the input parameter ptr "
+                "points to host memory");
+  temp_memory(sycl::queue queue, void *ptr)
+      : _queue(queue)
+#ifdef DPCT_USM_LEVEL_NONE
+        ,
+        _buffer(is_host_memory ? sycl::buffer<T, 1>(sycl::range<1>(1))
+                               : sycl::buffer<T, 1>(dpct::get_buffer<T>(ptr)))
+#endif
+  {
+    if constexpr (is_host_memory) {
+      _original_host_ptr = static_cast<host_memory_t *>(ptr);
+#ifdef DPCT_USM_LEVEL_NONE
+      auto _buffer_acc = _buffer.get_host_access(sycl::write_only);
+      _buffer_acc[0] = static_cast<T>(*_original_host_ptr);
+#else
+      _memory_ptr = sycl::malloc_host<T>(1, _queue);
+      *_memory_ptr = static_cast<T>(*_original_host_ptr);
+#endif
+    } else {
+#ifndef DPCT_USM_LEVEL_NONE
+      _memory_ptr = static_cast<T *>(ptr);
+#endif
+    }
+  }
+
+  ~temp_memory() {
+    if constexpr (is_host_memory) {
+#ifdef DPCT_USM_LEVEL_NONE
+      auto _buffer_acc = _buffer.get_host_access(sycl::read_only);
+      *_original_host_ptr = static_cast<host_memory_t>(_buffer_acc[0]);
+#else
+      _queue.wait();
+      *_original_host_ptr = *_memory_ptr;
+      sycl::free(_memory_ptr, _queue);
+#endif
+    }
+  }
+  auto get_memory_ptr() {
+#ifdef DPCT_USM_LEVEL_NONE
+    return &_buffer;
+#else
+    return _memory_ptr;
+#endif
+  }
+
+private:
+  sycl::queue _queue;
+  host_memory_t *_original_host_ptr = nullptr;
+#ifdef DPCT_USM_LEVEL_NONE
+  sycl::buffer<T, 1> _buffer;
+#else
+  T *_memory_ptr;
+#endif
+};
+} // namespace detail
+
+/// Do initial estimation of work and load balancing of computing a sparse
+/// matrix-sparse matrix product.
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans_a Specifies operation on input matrix a.
+/// \param [in] trans_b Specifies operation on input matrix b.
+/// \param [in] alpha Specifies the scalar alpha.
+/// \param [in] a Specifies the sparse matrix a.
+/// \param [in] b Specifies the sparse matrix b.
+/// \param [in] beta Specifies the scalar beta.
+/// \param [in, out] c Specifies the sparse matrix c.
+/// \param [in] matmat_descr Describes the sparse matrix-sparse matrix operation
+/// to be executed.
+/// \param [in, out] size_temp_buffer Specifies the size of workspace.
+/// \param [in] temp_buffer Specifies the memory of the workspace.
+inline void
+spgemm_work_estimation(sycl::queue queue, oneapi::mkl::transpose trans_a,
+                       oneapi::mkl::transpose trans_b, const void *alpha,
+                       sparse_matrix_desc_t a, sparse_matrix_desc_t b,
+                       const void *beta, sparse_matrix_desc_t c,
+                       oneapi::mkl::sparse::matmat_descr_t matmat_descr,
+                       size_t *size_temp_buffer, void *temp_buffer) {
+  if (temp_buffer) {
+    detail::temp_memory<std::int64_t, true, size_t> size_memory(
+        queue, size_temp_buffer);
+    detail::temp_memory<std::uint8_t, false> work_memory(queue, temp_buffer);
+    oneapi::mkl::sparse::matmat(
+        queue, a->get_matrix_handle(), b->get_matrix_handle(),
+        c->get_matrix_handle(),
+        oneapi::mkl::sparse::matmat_request::work_estimation, matmat_descr,
+        size_memory.get_memory_ptr(), work_memory.get_memory_ptr()
+#ifndef DPCT_USM_LEVEL_NONE
+        , {}
+#endif
+    );
+  } else {
+    oneapi::mkl::sparse::set_matmat_data(
+        matmat_descr, oneapi::mkl::sparse::matrix_view_descr::general, trans_a,
+        oneapi::mkl::sparse::matrix_view_descr::general, trans_b,
+        oneapi::mkl::sparse::matrix_view_descr::general);
+    detail::temp_memory<std::int64_t, true, size_t> size_memory(
+        queue, size_temp_buffer);
+    oneapi::mkl::sparse::matmat(
+        queue, a->get_matrix_handle(), b->get_matrix_handle(),
+        c->get_matrix_handle(),
+        oneapi::mkl::sparse::matmat_request::get_work_estimation_buf_size,
+        matmat_descr, size_memory.get_memory_ptr(), nullptr
+#ifndef DPCT_USM_LEVEL_NONE
+        , {}
+#endif
+    );
+  }
+}
+
+/// Do internal products for computing the C matrix of computing a sparse
+/// matrix-sparse matrix product.
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans_a Specifies operation on input matrix a.
+/// \param [in] trans_b Specifies operation on input matrix b.
+/// \param [in] alpha Specifies the scalar alpha.
+/// \param [in] a Specifies the sparse matrix a.
+/// \param [in] b Specifies the sparse matrix b.
+/// \param [in] beta Specifies the scalar beta.
+/// \param [in, out] c Specifies the sparse matrix c.
+/// \param [in] matmat_descr Describes the sparse matrix-sparse matrix operation
+/// to be executed.
+/// \param [in, out] size_temp_buffer Specifies the size of workspace.
+/// \param [in] temp_buffer Specifies the memory of the workspace.
+inline void spgemm_compute(sycl::queue queue, oneapi::mkl::transpose trans_a,
+                           oneapi::mkl::transpose trans_b, const void *alpha,
+                           sparse_matrix_desc_t a, sparse_matrix_desc_t b,
+                           const void *beta, sparse_matrix_desc_t c,
+                           oneapi::mkl::sparse::matmat_descr_t matmat_descr,
+                           size_t *size_temp_buffer, void *temp_buffer) {
+  if (temp_buffer) {
+    std::int64_t nnz_value = 0;
+    {
+      detail::temp_memory<std::int64_t, true, size_t> size_memory(
+          queue, size_temp_buffer);
+      detail::temp_memory<std::uint8_t, false> work_memory(queue, temp_buffer);
+      detail::temp_memory<std::int64_t, true, std::int64_t> nnz_memory(
+          queue, &nnz_value);
+      oneapi::mkl::sparse::matmat(
+          queue, a->get_matrix_handle(), b->get_matrix_handle(),
+          c->get_matrix_handle(), oneapi::mkl::sparse::matmat_request::compute,
+          matmat_descr, size_memory.get_memory_ptr(),
+          work_memory.get_memory_ptr()
+#ifndef DPCT_USM_LEVEL_NONE
+          , {}
+#endif
+      );
+      oneapi::mkl::sparse::matmat(
+          queue, a->get_matrix_handle(), b->get_matrix_handle(),
+          c->get_matrix_handle(), oneapi::mkl::sparse::matmat_request::get_nnz,
+          matmat_descr, nnz_memory.get_memory_ptr(), nullptr
+#ifndef DPCT_USM_LEVEL_NONE
+          , {}
+#endif
+      );
+    }
+    c->set_nnz(nnz_value);
+  } else {
+    detail::temp_memory<std::int64_t, true, size_t> size_memory(
+        queue, size_temp_buffer);
+    oneapi::mkl::sparse::matmat(
+        queue, a->get_matrix_handle(), b->get_matrix_handle(),
+        c->get_matrix_handle(),
+        oneapi::mkl::sparse::matmat_request::get_compute_buf_size, matmat_descr,
+        size_memory.get_memory_ptr(), nullptr
+#ifndef DPCT_USM_LEVEL_NONE
+        , {}
+#endif
+    );
+  }
+}
+
+/// Do any remaining internal products and accumulation and transfer into final
+/// C matrix arrays of computing a sparse matrix-sparse matrix product.
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans_a Specifies operation on input matrix a.
+/// \param [in] trans_b Specifies operation on input matrix b.
+/// \param [in] alpha Specifies the scalar alpha.
+/// \param [in] a Specifies the sparse matrix a.
+/// \param [in] b Specifies the sparse matrix b.
+/// \param [in] beta Specifies the scalar beta.
+/// \param [in, out] c Specifies the sparse matrix c.
+/// \param [in] matmat_descr Describes the sparse matrix-sparse matrix operation
+/// to be executed.
+inline void spgemm_finalize(sycl::queue queue, oneapi::mkl::transpose trans_a,
+                            oneapi::mkl::transpose trans_b, const void *alpha,
+                            sparse_matrix_desc_t a, sparse_matrix_desc_t b,
+                            const void *beta, sparse_matrix_desc_t c,
+                            oneapi::mkl::sparse::matmat_descr_t matmat_descr) {
+  oneapi::mkl::sparse::matmat(queue, a->get_matrix_handle(),
+                              b->get_matrix_handle(), c->get_matrix_handle(),
+                              oneapi::mkl::sparse::matmat_request::finalize,
+                              matmat_descr, nullptr, nullptr
+#ifdef DPCT_USM_LEVEL_NONE
+  );
+#else
+  , {}).wait();
+#endif
+  if (c->get_shadow_row_ptr()) {
+    switch (c->get_col_ind_type()) {
+    case library_data_t::real_int32: {
+      dpct::dpct_memcpy(c->get_row_ptr(), c->get_shadow_row_ptr(),
+                        sizeof(std::int32_t) * (c->get_row_num() + 1));
+      break;
+    }
+    case library_data_t::real_int64: {
+      dpct::dpct_memcpy(c->get_row_ptr(), c->get_shadow_row_ptr(),
+                        sizeof(std::int64_t) * (c->get_row_num() + 1));
+      break;
+    }
+    default:
+      throw std::runtime_error("dpct::sparse::spgemm_finalize(): The data type "
+                               "of the col_ind in matrix c is unsupported.");
+    }
+  }
+}
+
+namespace detail {
+template <typename T> struct spsv_impl {
+  void operator()(sycl::queue queue, oneapi::mkl::uplo uplo,
+                  oneapi::mkl::diag diag, oneapi::mkl::transpose trans_a,
+                  const void *alpha, sparse_matrix_desc_t a,
+                  std::shared_ptr<dense_vector_desc> x,
+                  std::shared_ptr<dense_vector_desc> y) {
+    auto alpha_value =
+        dpct::detail::get_value(reinterpret_cast<const T *>(alpha), queue);
+    T *new_x_ptr = nullptr;
+    if (alpha_value != T(1.0f)) {
+      new_x_ptr = (T *)dpct::dpct_malloc(x->get_ele_num() * sizeof(T));
+      dpct::dpct_memcpy(new_x_ptr, x->get_value(),
+                        x->get_ele_num() * sizeof(T));
+      auto data_new_x = dpct::detail::get_memory<T>(new_x_ptr);
+      oneapi::mkl::blas::column_major::scal(queue, x->get_ele_num(),
+                                            alpha_value, data_new_x, 1);
+    } else {
+      new_x_ptr = static_cast<T *>(x->get_value());
+    }
+    auto data_new_x = dpct::detail::get_memory<T>(new_x_ptr);
+    auto data_y = dpct::detail::get_memory<T>(y->get_value());
+    oneapi::mkl::sparse::trsv(queue, uplo, trans_a, diag,
+                              a->get_matrix_handle(), data_new_x, data_y);
+    if (alpha_value != T(1.0f)) {
+      queue.wait();
+      dpct::dpct_free(new_x_ptr);
+    }
+  }
+};
+} // namespace detail
+
+/// Performs internal optimizations for spsv by analyzing the provided matrix
+/// structure and operation parameters.
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans_a Specifies operation on input matrix a.
+/// \param [in] a Specifies the sparse matrix a.
+inline void spsv_optimize(sycl::queue queue, oneapi::mkl::transpose trans_a,
+                          sparse_matrix_desc_t a) {
+  if (!a->get_uplo() || !a->get_diag()) {
+    throw std::runtime_error(
+        "dpct::sparse::spsv_optimize(): oneapi::mkl::sparse::optimize_trsv "
+        "needs uplo and diag attributes to be specified.");
+  }
+  oneapi::mkl::sparse::optimize_trsv(
+      queue, a->get_uplo().value(), oneapi::mkl::transpose::nontrans,
+      a->get_diag().value(), a->get_matrix_handle());
+}
+
+/// Solves a system of linear equations for a sparse matrix.
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] trans_a Specifies operation on input matrix a.
+/// \param [in] alpha Specifies the scalar alpha.
+/// \param [in] a Specifies the sparse matrix a.
+/// \param [in] x Specifies the dense vector x.
+/// \param [in, out] y Specifies the dense vector y.
+/// \param [in] data_type Specifies the data type of \param a, \param x and
+/// \param y .
+inline void spsv(sycl::queue queue, oneapi::mkl::transpose trans_a,
+                 const void *alpha, sparse_matrix_desc_t a,
+                 std::shared_ptr<dense_vector_desc> x,
+                 std::shared_ptr<dense_vector_desc> y,
+                 library_data_t data_type) {
+  if (!a->get_uplo() || !a->get_diag()) {
+    throw std::runtime_error(
+        "dpct::sparse::spsv(): oneapi::mkl::sparse::trsv needs uplo and diag "
+        "attributes to be specified.");
+  }
+  oneapi::mkl::uplo uplo = a->get_uplo().value();
+  oneapi::mkl::diag diag = a->get_diag().value();
+  detail::spblas_shim<detail::spsv_impl>(a->get_value_type(), queue, uplo, diag,
+                                         trans_a, alpha, a, x, y);
+}
+
+namespace detail {
+template <typename T> struct csr2csc_impl {
+  void operator()(sycl::queue queue, int m, int n, int nnz,
+                  const void *from_val, const int *from_row_ptr,
+                  const int *from_col_ind, void *to_val, int *to_col_ptr,
+                  int *to_row_ind, conversion_scope range,
+                  oneapi::mkl::index_base base) {
+    using Ty = typename dpct::DataType<T>::T2;
+    oneapi::mkl::sparse::matrix_handle_t from_handle = nullptr;
+    oneapi::mkl::sparse::matrix_handle_t to_handle = nullptr;
+    oneapi::mkl::sparse::init_matrix_handle(&from_handle);
+    oneapi::mkl::sparse::init_matrix_handle(&to_handle);
+    auto data_from_row_ptr = dpct::detail::get_memory<int>(from_row_ptr);
+    auto data_from_col_ind = dpct::detail::get_memory<int>(from_col_ind);
+    auto data_from_val = dpct::detail::get_memory<Ty>(from_val);
+    auto data_to_col_ptr = dpct::detail::get_memory<int>(to_col_ptr);
+    auto data_to_row_ind = dpct::detail::get_memory<int>(to_row_ind);
+    void *new_to_value = to_val;
+    if (range == conversion_scope::index) {
+      new_to_value = dpct::dpct_malloc(sizeof(Ty) * nnz);
+    }
+    auto data_to_val = dpct::detail::get_memory<Ty>(new_to_value);
+    oneapi::mkl::sparse::set_csr_data(queue, from_handle, m, n, base,
+                                      data_from_row_ptr, data_from_col_ind,
+                                      data_from_val);
+    oneapi::mkl::sparse::set_csr_data(queue, to_handle, n, m, base,
+                                      data_to_col_ptr, data_to_row_ind,
+                                      data_to_val);
+    sycl::event e1 = oneapi::mkl::sparse::omatcopy(
+        queue, oneapi::mkl::transpose::trans, from_handle, to_handle);
+    oneapi::mkl::sparse::release_matrix_handle(queue, &from_handle, {e1});
+    sycl::event e2 =
+        oneapi::mkl::sparse::release_matrix_handle(queue, &to_handle, {e1});
+    if (range == conversion_scope::index) {
+      dpct::async_dpct_free({new_to_value}, {e2}, queue);
+    }
+  }
+};
+} // namespace detail
+
+/// Convert a CSR sparse matrix to a CSC sparse matrix.
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] m Number of rows of the matrix.
+/// \param [in] n Number of columns of the matrix.
+/// \param [in] nnz Number of non-zero elements.
+/// \param [in] from_val An array containing the non-zero elements of the input
+/// matrix.
+/// \param [in] from_row_ptr An array of length \p m + 1.
+/// \param [in] from_col_ind An array containing the column indices in
+/// index-based numbering.
+/// \param [out] to_val An array containing the non-zero elements of the output
+/// matrix.
+/// \param [out] to_col_ptr An array of length \p n + 1.
+/// \param [out] to_row_ind An array containing the row indices in index-based
+/// numbering.
+/// \param [in] range Specifies the conversion scope.
+/// \param [in] base Specifies the index base.
+template <typename T>
+inline void csr2csc(sycl::queue queue, int m, int n, int nnz, const T *from_val,
+                    const int *from_row_ptr, const int *from_col_ind, T *to_val,
+                    int *to_col_ptr, int *to_row_ind, conversion_scope range,
+                    oneapi::mkl::index_base base) {
+  detail::csr2csc_impl<T>()(queue, m, n, nnz, from_val, from_row_ptr,
+                            from_col_ind, to_val, to_col_ptr, to_row_ind, range,
+                            base);
+}
+
+/// Convert a CSR sparse matrix to a CSC sparse matrix.
+/// \param [in] queue The queue where the routine should be executed. It must
+/// have the in_order property when using the USM mode.
+/// \param [in] m Number of rows of the matrix.
+/// \param [in] n Number of columns of the matrix.
+/// \param [in] nnz Number of non-zero elements.
+/// \param [in] from_val An array containing the non-zero elements of the input
+/// matrix.
+/// \param [in] from_row_ptr An array of length \p m + 1.
+/// \param [in] from_col_ind An array containing the column indices in
+/// index-based numbering.
+/// \param [out] to_val An array containing the non-zero elements of the output
+/// matrix.
+/// \param [out] to_col_ptr An array of length \p n + 1.
+/// \param [out] to_row_ind An array containing the row indices in index-based
+/// numbering.
+/// \param [in] value_type Data type of \p from_val and \p to_val .
+/// \param [in] range Specifies the conversion scope.
+/// \param [in] base Specifies the index base.
+inline void csr2csc(sycl::queue queue, int m, int n, int nnz,
+                    const void *from_val, const int *from_row_ptr,
+                    const int *from_col_ind, void *to_val, int *to_col_ptr,
+                    int *to_row_ind, library_data_t value_type,
+                    conversion_scope range, oneapi::mkl::index_base base) {
+  detail::spblas_shim<detail::csr2csc_impl>(
+      value_type, queue, m, n, nnz, from_val, from_row_ptr, from_col_ind,
+      to_val, to_col_ptr, to_row_ind, range, base);
+}
+#endif
+} // namespace sparse
+} // namespace dpct
+
+#endif // __DPCT_SPARSE_UTILS_HPP__
diff --git a/dpct/util.hpp b/dpct/util.hpp
new file mode 100644
index 0000000000000..9af7c924701b7
--- /dev/null
+++ b/dpct/util.hpp
@@ -0,0 +1,1070 @@
+//==---- util.hpp ---------------------------------*- C++ -*----------------==//
+//
+// Copyright (C) Intel Corporation
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __DPCT_UTIL_HPP__
+#define __DPCT_UTIL_HPP__
+
+#include <sycl/sycl.hpp>
+#include <complex>
+#include <type_traits>
+#include <cassert>
+#include <cstdint>
+
+// TODO: Remove these function definitions once they exist in the DPC++ compiler
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
+template <typename T>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT __attribute__((noduplicate))
+T __spirv_GroupNonUniformShuffle(__spv::Scope::Flag, T, unsigned) noexcept;
+
+template <typename T>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT __attribute__((noduplicate))
+T __spirv_GroupNonUniformShuffleDown(__spv::Scope::Flag, T, unsigned) noexcept;
+
+template <typename T>
+__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT __attribute__((noduplicate))
+T __spirv_GroupNonUniformShuffleUp(__spv::Scope::Flag, T, unsigned) noexcept;
+#endif
+
+namespace dpct {
+
+namespace detail {
+
+template <typename tag, typename T> class generic_error_type {
+public:
+  generic_error_type() = default;
+  generic_error_type(T value) : value{value} {}
+  operator T() const { return value; }
+
+private:
+  T value;
+};
+
+} // namespace detail
+
+using err0 = detail::generic_error_type<struct err0_tag, int>;
+using err1 = detail::generic_error_type<struct err1_tag, int>;
+
+template <int... Ints> struct integer_sequence {};
+template <int Size, int... Ints>
+struct make_index_sequence
+    : public make_index_sequence<Size - 1, Size - 1, Ints...> {};
+template <int... Ints>
+struct make_index_sequence<0, Ints...> : public integer_sequence<Ints...> {};
+
+template <typename T> struct DataType { using T2 = T; };
+template <typename T> struct DataType<sycl::vec<T, 2>> {
+  using T2 = std::complex<T>;
+};
+
+inline void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld,
+                            int from_ld, int rows, int cols, int elem_size,
+                            memcpy_direction direction = automatic,
+                            sycl::queue &queue = dpct::get_default_queue(),
+                            bool async = false) {
+  if (to_ptr == from_ptr && to_ld == from_ld) {
+    return;
+  }
+
+  if (to_ld == from_ld) {
+    size_t copy_size = elem_size * ((cols - 1) * (size_t)to_ld + rows);
+    if (async)
+      detail::dpct_memcpy(queue, (void *)to_ptr, (void *)from_ptr,
+                          copy_size, direction);
+    else
+      detail::dpct_memcpy(queue, (void *)to_ptr, (void *)from_ptr,
+                          copy_size, direction).wait();
+  } else {
+    if (async)
+      detail::dpct_memcpy(queue, to_ptr, from_ptr, elem_size * to_ld,
+                          elem_size * from_ld, elem_size * rows, cols,
+                          direction);
+    else
+      sycl::event::wait(detail::dpct_memcpy(
+          queue, to_ptr, from_ptr, elem_size * to_ld, elem_size * from_ld,
+          elem_size * rows, cols, direction));
+  }
+}
+
+/// Copy matrix data. The default leading dimension is column.
+/// \param [out] to_ptr A pointer points to the destination location.
+/// \param [in] from_ptr A pointer points to the source location.
+/// \param [in] to_ld The leading dimension the destination matrix.
+/// \param [in] from_ld The leading dimension the source matrix.
+/// \param [in] rows The number of rows of the source matrix.
+/// \param [in] cols The number of columns of the source matrix.
+/// \param [in] direction The direction of the data copy.
+/// \param [in] queue The queue where the routine should be executed.
+/// \param [in] async If this argument is true, the return of the function
+/// does NOT guarantee the copy is completed.
+template <typename T>
+inline void matrix_mem_copy(T *to_ptr, const T *from_ptr, int to_ld,
+                            int from_ld, int rows, int cols,
+                            memcpy_direction direction = automatic,
+                            sycl::queue &queue = dpct::get_default_queue(),
+                            bool async = false) {
+  using Ty = typename DataType<T>::T2;
+  matrix_mem_copy((void *)to_ptr, (void *)from_ptr, to_ld, from_ld, rows, cols,
+                  sizeof(Ty), direction, queue, async);
+}
+
+/// Cast the high or low 32 bits of a double to an integer.
+/// \param [in] d The double value.
+/// \param [in] use_high32 Cast the high 32 bits of the double if true;
+/// otherwise cast the low 32 bits.
+inline int cast_double_to_int(double d, bool use_high32 = true) {
+  sycl::vec<double, 1> v0{d};
+  auto v1 = v0.as<sycl::int2>();
+  if (use_high32)
+    return v1[1];
+  return v1[0];
+}
+
+/// Combine two integers, the first as the high 32 bits and the second
+/// as the low 32 bits, into a double.
+/// \param [in] high32 The integer as the high 32 bits
+/// \param [in] low32 The integer as the low 32 bits
+inline double cast_ints_to_double(int high32, int low32) {
+  sycl::int2 v0{low32, high32};
+  auto v1 = v0.as<sycl::vec<double, 1>>();
+  return v1;
+}
+
+/// Reverse the bit order of an unsigned integer
+/// \param [in] a Input unsigned integer value
+/// \returns Value of a with the bit order reversed
+template <typename T> inline T reverse_bits(T a) {
+  static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
+                "unsigned integer required");
+  if (!a)
+    return 0;
+  T mask = 0;
+  size_t count = 4 * sizeof(T);
+  mask = ~mask >> count;
+  while (count) {
+    a = ((a & mask) << count) | ((a & ~mask) >> count);
+    count = count >> 1;
+    mask = mask ^ (mask << count);
+  }
+  return a;
+}
+
+/// \param [in] a The first value contains 4 bytes
+/// \param [in] b The second value contains 4 bytes
+/// \param [in] s The selector value, only lower 16bit used
+/// \returns the permutation result of 4 bytes selected in the way
+/// specified by \p s from \p a and \p b
+inline unsigned int byte_level_permute(unsigned int a, unsigned int b,
+                                       unsigned int s) {
+  unsigned int ret;
+  ret =
+      ((((std::uint64_t)b << 32 | a) >> (s & 0x7) * 8) & 0xff) |
+      (((((std::uint64_t)b << 32 | a) >> ((s >> 4) & 0x7) * 8) & 0xff) << 8) |
+      (((((std::uint64_t)b << 32 | a) >> ((s >> 8) & 0x7) * 8) & 0xff) << 16) |
+      (((((std::uint64_t)b << 32 | a) >> ((s >> 12) & 0x7) * 8) & 0xff) << 24);
+  return ret;
+}
+
+/// Find position of first least significant set bit in an integer.
+/// ffs(0) returns 0.
+///
+/// \param [in] a Input integer value
+/// \returns The position
+template <typename T> inline int ffs(T a) {
+  static_assert(std::is_integral<T>::value, "integer required");
+  return (sycl::ctz(a) + 1) % (sizeof(T) * 8 + 1);
+}
+
+/// select_from_sub_group allows work-items to obtain a copy of a value held by
+/// any other work-item in the sub_group. The input sub_group will be divided
+/// into several logical sub_groups with id range [0, \p logical_sub_group_size
+/// - 1]. Each work-item in logical sub_group gets value from another work-item
+/// whose id is \p remote_local_id. If \p remote_local_id is outside the
+/// logical sub_group id range, \p remote_local_id will modulo with \p
+/// logical_sub_group_size. The \p logical_sub_group_size must be a power of 2
+/// and not exceed input sub_group size.
+/// \tparam T Input value type
+/// \param [in] g Input sub_group
+/// \param [in] x Input value
+/// \param [in] remote_local_id Input source work item id
+/// \param [in] logical_sub_group_size Input logical sub_group size
+/// \returns The result
+template <typename T>
+T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id,
+                        int logical_sub_group_size = 32) {
+  unsigned int start_index =
+      g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
+  return sycl::select_from_group(
+      g, x, start_index + remote_local_id % logical_sub_group_size);
+}
+
+/// shift_sub_group_left move values held by the work-items in a sub_group
+/// directly to another work-item in the sub_group, by shifting values a fixed
+/// number of work-items to the left. The input sub_group will be divided into
+/// several logical sub_groups with id range [0, \p logical_sub_group_size - 1].
+/// Each work-item in logical sub_group gets value from another work-item whose
+/// id is caller's id adds \p delta. If calculated id is outside the logical
+/// sub_group id range, the work-item will get value from itself. The \p
+/// logical_sub_group_size must be a power of 2 and not exceed input sub_group
+/// size.
+/// \tparam T Input value type
+/// \param [in] g Input sub_group
+/// \param [in] x Input value
+/// \param [in] delta Input delta
+/// \param [in] logical_sub_group_size Input logical sub_group size
+/// \returns The result
+template <typename T>
+T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta,
+                       int logical_sub_group_size = 32) {
+  unsigned int id = g.get_local_linear_id();
+  unsigned int end_index =
+      (id / logical_sub_group_size + 1) * logical_sub_group_size;
+  T result = sycl::shift_group_left(g, x, delta);
+  if ((id + delta) >= end_index) {
+    result = x;
+  }
+  return result;
+}
+
+/// shift_sub_group_right move values held by the work-items in a sub_group
+/// directly to another work-item in the sub_group, by shifting values a fixed
+/// number of work-items to the right. The input sub_group will be divided into
+/// several logical_sub_groups with id range [0, \p logical_sub_group_size - 1].
+/// Each work-item in logical_sub_group gets value from another work-item whose
+/// id is caller's id subtracts \p delta. If calculated id is outside the
+/// logical sub_group id range, the work-item will get value from itself. The \p
+/// logical_sub_group_size must be a power of 2 and not exceed input sub_group
+/// size.
+/// \tparam T Input value type
+/// \param [in] g Input sub_group
+/// \param [in] x Input value
+/// \param [in] delta Input delta
+/// \param [in] logical_sub_group_size Input logical sub_group size
+/// \returns The result
+template <typename T>
+T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta,
+                        int logical_sub_group_size = 32) {
+  unsigned int id = g.get_local_linear_id();
+  unsigned int start_index =
+      id / logical_sub_group_size * logical_sub_group_size;
+  T result = sycl::shift_group_right(g, x, delta);
+  if ((id - start_index) < delta) {
+    result = x;
+  }
+  return result;
+}
+
+/// permute_sub_group_by_xor permutes values by exchanging values held by pairs
+/// of work-items identified by computing the bitwise exclusive OR of the
+/// work-item id and some fixed mask. The input sub_group will be divided into
+/// several logical sub_groups with id range [0, \p logical_sub_group_size - 1].
+/// Each work-item in logical sub_group gets value from another work-item whose
+/// id is bitwise exclusive OR of the caller's id and \p mask. If calculated id
+/// is outside the logical sub_group id range, the work-item will get value from
+/// itself. The \p logical_sub_group_size must be a power of 2 and not exceed
+/// input sub_group size.
+/// \tparam T Input value type
+/// \param [in] g Input sub_group
+/// \param [in] x Input value
+/// \param [in] mask Input mask
+/// \param [in] logical_sub_group_size Input logical sub_group size
+/// \returns The result
+template <typename T>
+T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
+                           int logical_sub_group_size = 32) {
+  unsigned int id = g.get_local_linear_id();
+  unsigned int start_index =
+      id / logical_sub_group_size * logical_sub_group_size;
+  unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
+  return sycl::select_from_group(g, x,
+                                 target_offset < logical_sub_group_size
+                                     ? start_index + target_offset
+                                     : id);
+}
+
+/// The function match_any_over_sub_group conducts a comparison of values
+/// across work-items within a sub-group. match_any_over_sub_group return a mask
+/// in which some bits are set to 1, indicating that the \p value provided by
+/// the work-item represented by these bits are equal. The n-th bit of mask
+/// representing the work-item with id n. The parameter \p member_mask
+/// indicating the work-items participating the call.
+/// \tparam T Input value type
+/// \param [in] g Input sub_group
+/// \param [in] member_mask Input mask
+/// \param [in] value Input value
+/// \returns The result
+template <typename T>
+unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask,
+                                      T value) {
+  static_assert(std::is_arithmetic_v<T>, "Value type must be arithmetic type.");                    
+  if (!member_mask) {
+    return 0;
+  }
+  unsigned int id = g.get_local_linear_id();
+  unsigned int flag = 0, result = 0, reduce_result = 0;
+  unsigned int bit_index = 0x1 << id;
+  bool is_participate = member_mask & bit_index;
+  T broadcast_value = 0;
+  bool matched = false;
+  while (flag != member_mask) {
+    broadcast_value =
+        sycl::select_from_group(g, value, sycl::ctz((~flag & member_mask)));
+    reduce_result = sycl::reduce_over_group(
+        g, is_participate ? (broadcast_value == value ? bit_index : 0) : 0,
+        sycl::plus<>());
+    flag |= reduce_result;
+    matched = reduce_result & bit_index;
+    result = matched * reduce_result + (1 - matched) * result;
+  }
+  return result;
+}
+
+/// The function match_all_over_sub_group conducts a comparison of values
+/// across work-items within a sub-group. match_all_over_sub_group return \p
+/// member_mask and predicate \p pred will be set to 1 if all \p value that
+/// provided by each work-item in \p member_mask are equal, otherwise return 0
+/// and the predicate \p pred will be set to 0. The n-th bit of \p member_mask
+/// representing the work-item with id n. The parameter \p member_mask
+/// indicating the work-items participating the call.
+/// \tparam T Input value type
+/// \param [in] g Input sub_group
+/// \param [in] member_mask Input mask
+/// \param [in] value Input value
+/// \param [out] pred Output predicate
+/// \returns The result
+template <typename T>
+unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask,
+                                      T value, int *pred) {
+  static_assert(std::is_arithmetic_v<T>, "Value type must be arithmetic type."); 
+  if (!member_mask) {
+    return 0;
+  }
+  unsigned int id = g.get_local_linear_id();
+  unsigned int bit_index = 0x1 << id;
+  bool is_participate = member_mask & bit_index;
+  T broadcast_value = sycl::select_from_group(g, value, sycl::ctz(member_mask));
+  unsigned int reduce_result = sycl::reduce_over_group(
+      g,
+      (member_mask & bit_index) ? (broadcast_value == value ? bit_index : 0)
+                                : 0,
+      sycl::plus<>());
+  bool all_equal = (reduce_result == member_mask);
+  *pred = is_participate & all_equal;
+  return all_equal * member_mask;
+}
+
+namespace experimental {
+#if defined(__NVPTX__) && defined(__SYCL_DEVICE_ONLY__)
+#define SHFL_SYNC(RES, MASK, VAL, SHFL_PARAM, C, SHUFFLE_INSTR)                \
+  if constexpr (std::is_same_v<T, double>) {                                   \
+    int x_a, x_b;                                                              \
+    asm("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "d"(VAL));              \
+    auto tmp_a = __nvvm_shfl_sync_##SHUFFLE_INSTR(MASK, x_a, SHFL_PARAM, C);   \
+    auto tmp_b = __nvvm_shfl_sync_##SHUFFLE_INSTR(MASK, x_b, SHFL_PARAM, C);   \
+    asm("mov.b64 %0,{%1,%2};" : "=d"(RES) : "r"(tmp_a), "r"(tmp_b));           \
+  } else if constexpr (std::is_same_v<T, long> ||                              \
+                       std::is_same_v<T, unsigned long>) {                     \
+    int x_a, x_b;                                                              \
+    asm("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "l"(VAL));              \
+    auto tmp_a = __nvvm_shfl_sync_##SHUFFLE_INSTR(MASK, x_a, SHFL_PARAM, C);   \
+    auto tmp_b = __nvvm_shfl_sync_##SHUFFLE_INSTR(MASK, x_b, SHFL_PARAM, C);   \
+    asm("mov.b64 %0,{%1,%2};" : "=l"(RES) : "r"(tmp_a), "r"(tmp_b));           \
+  } else if constexpr (std::is_same_v<T, sycl::half>) {                        \
+    short tmp_b16;                                                             \
+    asm("mov.b16 %0,%1;" : "=h"(tmp_b16) : "h"(VAL));                          \
+    auto tmp_b32 = __nvvm_shfl_sync_##SHUFFLE_INSTR(                           \
+        MASK, static_cast<int>(tmp_b16), SHFL_PARAM, C);                       \
+    asm("mov.b16 %0,%1;" : "=h"(RES) : "h"(static_cast<short>(tmp_b32)));      \
+  } else if constexpr (std::is_same_v<T, float>) {                             \
+    auto tmp_b32 = __nvvm_shfl_sync_##SHUFFLE_INSTR(                           \
+        MASK, __nvvm_bitcast_f2i(VAL), SHFL_PARAM, C);                         \
+    RES = __nvvm_bitcast_i2f(tmp_b32);                                         \
+  } else {                                                                     \
+    RES = __nvvm_shfl_sync_##SHUFFLE_INSTR(MASK, VAL, SHFL_PARAM, C);          \
+  }
+#endif
+/// Masked version of select_from_sub_group, which execute masked sub-group
+/// operation. The parameter member_mask indicating the work-items participating
+/// the call. Whether the n-th bit is set to 1 representing whether the
+/// work-item with id n is participating the call. All work-items named in
+/// member_mask must be executed with the same member_mask, or the result is
+/// undefined.
+/// \tparam T Input value type
+/// \param [in] member_mask Input mask
+/// \param [in] g Input sub_group
+/// \param [in] x Input value
+/// \param [in] remote_local_id Input source work item id
+/// \param [in] logical_sub_group_size Input logical sub_group size
+/// \returns The result
+template <typename T>
+T select_from_sub_group(unsigned int member_mask,
+                        sycl::sub_group g, T x, int remote_local_id,
+                        int logical_sub_group_size = 32) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SPIR__)
+  unsigned int start_index =
+      g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
+  unsigned logical_remote_id =
+      start_index + remote_local_id % logical_sub_group_size;
+  return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x, logical_remote_id);
+#elif defined(__NVPTX__)
+  T result;
+  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
+  SHFL_SYNC(result, member_mask, x, remote_local_id, cVal, idx_i32)
+  return result;
+#endif
+#else
+  (void)g;
+  (void)x;
+  (void)remote_local_id;
+  (void)logical_sub_group_size;
+  (void)member_mask;
+  throw sycl::exception(sycl::errc::runtime, "Masked version of select_from_sub_group not "
+                        "supported on host device.");
+#endif // __SYCL_DEVICE_ONLY__
+}
+
+/// Masked version of shift_sub_group_left, which execute masked sub-group
+/// operation. The parameter member_mask indicating the work-items participating
+/// the call. Whether the n-th bit is set to 1 representing whether the
+/// work-item with id n is participating the call. All work-items named in
+/// member_mask must be executed with the same member_mask, or the result is
+/// undefined.
+/// \tparam T Input value type
+/// \param [in] member_mask Input mask
+/// \param [in] g Input sub_group
+/// \param [in] x Input value
+/// \param [in] delta Input delta
+/// \param [in] logical_sub_group_size Input logical sub_group size
+/// \returns The result
+template <typename T>
+T shift_sub_group_left(unsigned int member_mask,
+                       sycl::sub_group g, T x, unsigned int delta,
+                       int logical_sub_group_size = 32) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SPIR__)
+  unsigned int id = g.get_local_linear_id();
+  unsigned int end_index =
+      (id / logical_sub_group_size + 1) * logical_sub_group_size;
+  T result = __spirv_GroupNonUniformShuffleDown(__spv::Scope::Subgroup, x, delta);
+  if ((id + delta) >= end_index) {
+    result = x;
+  }
+  return result;
+#elif defined(__NVPTX__)
+  T result;
+  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
+  SHFL_SYNC(result, member_mask, x, delta, cVal, down_i32)
+  return result;
+#endif
+#else
+  (void)g;
+  (void)x;
+  (void)delta;
+  (void)logical_sub_group_size;
+  (void)member_mask;
+  throw sycl::exception(sycl::errc::runtime, "Masked version of select_from_sub_group not "
+                        "supported on host device.");
+#endif // __SYCL_DEVICE_ONLY__
+}
+
+/// Masked version of shift_sub_group_right, which execute masked sub-group
+/// operation. The parameter member_mask indicating the work-items participating
+/// the call. Whether the n-th bit is set to 1 representing whether the
+/// work-item with id n is participating the call. All work-items named in
+/// member_mask must be executed with the same member_mask, or the result is
+/// undefined.
+/// \tparam T Input value type
+/// \param [in] member_mask Input mask
+/// \param [in] g Input sub_group
+/// \param [in] x Input value
+/// \param [in] delta Input delta
+/// \param [in] logical_sub_group_size Input logical sub_group size
+/// \returns The result
+template <typename T>
+T shift_sub_group_right(unsigned int member_mask,
+                        sycl::sub_group g, T x, unsigned int delta,
+                        int logical_sub_group_size = 32) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SPIR__)
+  unsigned int id = g.get_local_linear_id();
+  unsigned int start_index =
+      id / logical_sub_group_size * logical_sub_group_size;
+  T result = __spirv_GroupNonUniformShuffleUp(__spv::Scope::Subgroup, x, delta);
+  if ((id - start_index) < delta) {
+    result = x;
+  }
+  return result;
+#elif defined(__NVPTX__)
+  T result;
+  int cVal = ((32 - logical_sub_group_size) << 8);
+  SHFL_SYNC(result, member_mask, x, delta, cVal, up_i32)
+  return result;
+#endif
+#else
+  (void)g;
+  (void)x;
+  (void)delta;
+  (void)logical_sub_group_size;
+  (void)member_mask;
+  throw sycl::exception(sycl::errc::runtime, "Masked version of select_from_sub_group not "
+                        "supported on host device.");
+#endif // __SYCL_DEVICE_ONLY__
+}
+
+/// Masked version of permute_sub_group_by_xor, which execute masked sub-group
+/// operation. The parameter member_mask indicating the work-items participating
+/// the call. Whether the n-th bit is set to 1 representing whether the
+/// work-item with id n is participating the call. All work-items named in
+/// member_mask must be executed with the same member_mask, or the result is
+/// undefined.
+/// \tparam T Input value type
+/// \param [in] member_mask Input mask
+/// \param [in] g Input sub_group
+/// \param [in] x Input value
+/// \param [in] mask Input mask
+/// \param [in] logical_sub_group_size Input logical sub_group size
+/// \returns The result
+template <typename T>
+T permute_sub_group_by_xor(unsigned int member_mask,
+                           sycl::sub_group g, T x, unsigned int mask,
+                           int logical_sub_group_size = 32) {
+#if defined(__SYCL_DEVICE_ONLY__)
+#if defined(__SPIR__)
+  unsigned int id = g.get_local_linear_id();
+  unsigned int start_index =
+      id / logical_sub_group_size * logical_sub_group_size;
+  unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
+  unsigned logical_remote_id = (target_offset < logical_sub_group_size) ? start_index + target_offset : id;
+  return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x, logical_remote_id);
+#elif defined(__NVPTX__)
+  T result;
+  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
+  SHFL_SYNC(result, member_mask, x, mask, cVal, bfly_i32)
+  return result;
+#endif
+#else
+  (void)g;
+  (void)x;
+  (void)mask;
+  (void)logical_sub_group_size;
+  (void)member_mask;
+  throw sycl::exception(sycl::errc::runtime, "Masked version of select_from_sub_group not "
+                        "supported on host device.");
+#endif // __SYCL_DEVICE_ONLY__
+}
+#if defined(__NVPTX__)
+#undef SHFL_SYNC
+#endif
+} // namespace experimental
+
+/// Computes the multiplication of two complex numbers.
+/// \tparam T Complex element type
+/// \param [in] x The first input complex number
+/// \param [in] y The second input complex number
+/// \returns The result
+template <typename T>
+sycl::vec<T, 2> cmul(sycl::vec<T, 2> x, sycl::vec<T, 2> y) {
+  std::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
+  t1 = t1 * t2;
+  return sycl::vec<T, 2>(t1.real(), t1.imag());
+}
+
+/// Computes the division of two complex numbers.
+/// \tparam T Complex element type
+/// \param [in] x The first input complex number
+/// \param [in] y The second input complex number
+/// \returns The result
+template <typename T>
+sycl::vec<T, 2> cdiv(sycl::vec<T, 2> x, sycl::vec<T, 2> y) {
+  std::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
+  t1 = t1 / t2;
+  return sycl::vec<T, 2>(t1.real(), t1.imag());
+}
+
+/// Computes the magnitude of a complex number.
+/// \tparam T Complex element type
+/// \param [in] x The input complex number
+/// \returns The result
+template <typename T>
+T cabs(sycl::vec<T, 2> x) {
+  std::complex<T> t(x[0], x[1]);
+  return std::abs(t);
+}
+
+/// Computes the complex conjugate of a complex number.
+/// \tparam T Complex element type
+/// \param [in] x The input complex number
+/// \returns The result
+template <typename T>
+sycl::vec<T, 2> conj(sycl::vec<T, 2> x) {
+  std::complex<T> t(x[0], x[1]);
+  t = std::conj(t);
+  return sycl::vec<T, 2>(t.real(), t.imag());
+}
+
+inline int get_sycl_language_version() {
+#ifdef SYCL_LANGUAGE_VERSION
+  return SYCL_LANGUAGE_VERSION;
+#else
+  return 202000;
+#endif
+}
+
+namespace experimental {
+/// Synchronize work items from all work groups within a SYCL kernel.
+/// \param [in] item:  Represents a work group.
+/// \param [in] counter: An atomic object defined on a device memory which can
+/// be accessed by work items in all work groups. The initial value of the
+/// counter should be zero.
+/// Note: Please make sure that all the work items of all work groups within
+/// a SYCL kernel can be scheduled actively at the same time on a device.
+template <int dimensions = 3>
+inline void
+nd_range_barrier(const sycl::nd_item<dimensions> &item,
+                 sycl::atomic_ref<
+                     unsigned int, sycl::memory_order::seq_cst,
+                     sycl::memory_scope::device,
+                     sycl::access::address_space::global_space> &counter) {
+
+  static_assert(dimensions == 3, "dimensions must be 3.");
+
+  unsigned int num_groups = item.get_group_range(2) * item.get_group_range(1) *
+                            item.get_group_range(0);
+
+  item.barrier();
+
+  if (item.get_local_linear_id() == 0) {
+    unsigned int inc = 1;
+    unsigned int old_arrive = 0;
+    bool is_group0 =
+        (item.get_group(2) + item.get_group(1) + item.get_group(0) == 0);
+    if (is_group0) {
+      inc = 0x80000000 - (num_groups - 1);
+    }
+
+    old_arrive = counter.fetch_add(inc);
+    // Synchronize all the work groups
+    while (((old_arrive ^ counter.load()) & 0x80000000) == 0)
+      ;
+  }
+
+  item.barrier();
+}
+
+/// Synchronize work items from all work groups within a SYCL kernel.
+/// \param [in] item:  Represents a work group.
+/// \param [in] counter: An atomic object defined on a device memory which can
+/// be accessed by work items in all work groups. The initial value of the
+/// counter should be zero.
+/// Note: Please make sure that all the work items of all work groups within
+/// a SYCL kernel can be scheduled actively at the same time on a device.
+template <>
+inline void
+nd_range_barrier(const sycl::nd_item<1> &item,
+                 sycl::atomic_ref<
+                     unsigned int, sycl::memory_order::seq_cst,
+                     sycl::memory_scope::device,
+                     sycl::access::address_space::global_space> &counter) {
+  unsigned int num_groups = item.get_group_range(0);
+
+  item.barrier();
+
+  if (item.get_local_linear_id() == 0) {
+    unsigned int inc = 1;
+    unsigned int old_arrive = 0;
+    bool is_group0 = (item.get_group(0) == 0);
+    if (is_group0) {
+      inc = 0x80000000 - (num_groups - 1);
+    }
+
+    old_arrive = counter.fetch_add(inc);
+    // Synchronize all the work groups
+    while (((old_arrive ^ counter.load()) & 0x80000000) == 0)
+      ;
+  }
+
+  item.barrier();
+}
+
+/// The logical-group is a logical collection of some work-items within a
+/// work-group.
+/// Note: Please make sure that the logical-group size is a power of 2 in the
+/// range [1, current_sub_group_size].
+template <int dimensions = 3> class logical_group {
+  sycl::nd_item<dimensions> _item;
+  sycl::group<dimensions> _g;
+  uint32_t _logical_group_size;
+  uint32_t _group_linear_range_in_parent;
+
+public:
+  /// Dividing \p parent_group into several logical-groups.
+  /// \param [in] item Current work-item.
+  /// \param [in] parent_group The group to be divided.
+  /// \param [in] size The logical-group size.
+  logical_group(sycl::nd_item<dimensions> item,
+                sycl::group<dimensions> parent_group, uint32_t size)
+      : _item(item), _g(parent_group), _logical_group_size(size) {
+    _group_linear_range_in_parent =
+        (_g.get_local_linear_range() - 1) / _logical_group_size + 1;
+  }
+  logical_group(sycl::nd_item<dimensions> item)
+      : _item(item), _g(item.get_group()) {}
+  /// Returns the index of the work-item within the logical-group.
+  uint32_t get_local_linear_id() const {
+    return _item.get_local_linear_id() % _logical_group_size;
+  }
+  /// Returns the index of the logical-group in the parent group.
+  uint32_t get_group_linear_id() const {
+    return _item.get_local_linear_id() / _logical_group_size;
+  }
+  /// Returns the number of work-items in the logical-group.
+  uint32_t get_local_linear_range() const {
+    if (_g.get_local_linear_range() % _logical_group_size == 0) {
+      return _logical_group_size;
+    }
+    uint32_t last_item_group_id =
+        _g.get_local_linear_range() / _logical_group_size;
+    uint32_t first_of_last_group = last_item_group_id * _logical_group_size;
+    if (_item.get_local_linear_id() >= first_of_last_group) {
+      return _g.get_local_linear_range() - first_of_last_group;
+    } else {
+      return _logical_group_size;
+    }
+  }
+  /// Returns the number of logical-group in the parent group.
+  uint32_t get_group_linear_range() const {
+    return _group_linear_range_in_parent;
+  }
+};
+
+// The original source of the functions calculate_max_active_wg_per_xecore and
+// calculate_max_potential_wg were under the license below:
+//
+// Copyright (C) Intel Corporation
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+// SOFTWARE.
+//
+/// This function is used for occupancy calculation, it computes the max active
+/// work-group number per Xe-Core. Ref to
+/// https://github.com/oneapi-src/oneAPI-samples/tree/master/Tools/GPU-Occupancy-Calculator
+/// \param [out] num_wg Active work-group number.
+/// \param [in] wg_size Work-group size.
+/// \param [in] slm_size Share local memory size.
+/// \param [in] sg_size Sub-group size.
+/// \param [in] used_barrier Whether barrier is used.
+/// \param [in] used_large_grf Whether large General Register File is used.
+/// \return If no error, returns 0.
+/// If \p wg_size exceeds the max work-group size, the max work-group size will
+/// be used instead of \p wg_size and returns -1.
+inline int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size,
+                                              int slm_size = 0,
+                                              int sg_size = 32,
+                                              bool used_barrier = false,
+                                              bool used_large_grf = false) {
+  int ret = 0;
+  const int slm_size_per_xe_core = 64 * 1024;
+  const int max_barrier_registers = 32;
+  dpct::device_ext &dev = dpct::get_current_device();
+
+  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
+  if (wg_size > max_wg_size) {
+    wg_size = max_wg_size;
+    ret = -1;
+  }
+
+  int num_threads_ss = 56;
+  int max_num_wg = 56;
+  if (dev.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice) &&
+      dev.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
+    auto eu_count =
+        dev.get_info<sycl::info::device::ext_intel_gpu_eu_count_per_subslice>();
+    auto threads_count =
+        dev.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
+    num_threads_ss = eu_count * threads_count;
+    max_num_wg = eu_count * threads_count;
+  }
+
+  if (used_barrier) {
+    max_num_wg = max_barrier_registers;
+  }
+
+  // Calculate num_wg_slm
+  int num_wg_slm = 0;
+  if (slm_size == 0) {
+    num_wg_slm = max_num_wg;
+  } else {
+    num_wg_slm = std::floor((float)slm_size_per_xe_core / slm_size);
+  }
+
+  // Calculate num_wg_threads
+  if (used_large_grf)
+    num_threads_ss = num_threads_ss / 2;
+  int num_threads = std::ceil((float)wg_size / sg_size);
+  int num_wg_threads = std::floor((float)num_threads_ss / num_threads);
+
+  // Calculate num_wg
+  *num_wg = std::min(num_wg_slm, num_wg_threads);
+  *num_wg = std::min(*num_wg, max_num_wg);
+  return ret;
+}
+
+/// This function is used for occupancy calculation, it computes the work-group
+/// number and the work-group size which achieves the maximum occupancy of the
+/// device potentially. Ref to
+/// https://github.com/oneapi-src/oneAPI-samples/tree/master/Tools/GPU-Occupancy-Calculator
+/// \param [out] num_wg Work-group number.
+/// \param [out] wg_size Work-group size.
+/// \param [in] max_ws_size_for_device_code The maximum working work-group size
+/// for current device code logic. Zero means no limitation.
+/// \param [in] slm_size Share local memory size.
+/// \param [in] sg_size Sub-group size.
+/// \param [in] used_barrier Whether barrier is used.
+/// \param [in] used_large_grf Whether large General Register File is used.
+/// \return Returns 0.
+inline int calculate_max_potential_wg(int *num_wg, int *wg_size,
+                                      int max_ws_size_for_device_code,
+                                      int slm_size = 0, int sg_size = 32,
+                                      bool used_barrier = false,
+                                      bool used_large_grf = false) {
+  sycl::device &dev = dpct::get_current_device();
+  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
+  if (max_ws_size_for_device_code == 0 ||
+      max_ws_size_for_device_code >= max_wg_size)
+    *wg_size = (int)max_wg_size;
+  else
+    *wg_size = max_ws_size_for_device_code;
+  calculate_max_active_wg_per_xecore(num_wg, *wg_size, slm_size, sg_size,
+                                     used_barrier, used_large_grf);
+  std::uint32_t num_ss = 1;
+  if (dev.has(sycl::aspect::ext_intel_gpu_slices) &&
+      dev.has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) {
+    num_ss =
+        dev.get_info<sycl::ext::intel::info::device::gpu_slices>() *
+        dev.get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>();
+  }
+  num_wg[0] = num_ss * num_wg[0];
+  return 0;
+}
+
+/// Supported group type during migration.
+enum class group_type { work_group, sub_group, logical_group, root_group };
+
+/// The group_base will dispatch the function call to the specific interface
+/// based on the group type.
+template <int dimensions = 3> class group_base {
+public:
+  group_base(sycl::nd_item<dimensions> item)
+      : nd_item(item), logical_group(item) {}
+  ~group_base() {}
+  /// Returns the number of work-items in the group.
+  size_t get_local_linear_range() {
+    switch (type) {
+    case group_type::work_group:
+      return nd_item.get_group().get_local_linear_range();
+    case group_type::sub_group:
+      return nd_item.get_sub_group().get_local_linear_range();
+    case group_type::logical_group:
+      return logical_group.get_local_linear_range();
+    default:
+      return -1; // Unkonwn group type
+    }
+  }
+  /// Returns the index of the work-item within the group.
+  size_t get_local_linear_id() {
+    switch (type) {
+    case group_type::work_group:
+      return nd_item.get_group().get_local_linear_id();
+    case group_type::sub_group:
+      return nd_item.get_sub_group().get_local_linear_id();
+    case group_type::logical_group:
+      return logical_group.get_local_linear_id();
+    default:
+      return -1; // Unkonwn group type
+    }
+  }
+  /// Wait for all the elements within the group to complete their execution
+  /// before proceeding.
+  void barrier() {
+    switch (type) {
+    case group_type::work_group:
+      sycl::group_barrier(nd_item.get_group());
+      break;
+    case group_type::sub_group:
+    case group_type::logical_group:
+      sycl::group_barrier(nd_item.get_sub_group());
+      break;
+    default:
+      break;
+    }
+  }
+
+protected:
+  logical_group<dimensions> logical_group;
+  sycl::nd_item<dimensions> nd_item;
+  group_type type;
+};
+
+/// The group class is a container type that can storage supported group_type.
+template <typename T, int dimensions = 3>
+class group : public group_base<dimensions> {
+  using group_base<dimensions>::type;
+  using group_base<dimensions>::logical_group;
+
+public:
+  group(T g, sycl::nd_item<dimensions> item) : group_base<dimensions>(item) {
+    if constexpr (std::is_same_v<T, sycl::sub_group>) {
+      type = group_type::sub_group;
+    } else if constexpr (std::is_same_v<T, sycl::group<dimensions>>) {
+      type = group_type::work_group;
+    } else if constexpr (std::is_same_v<T, dpct::experimental::logical_group<
+                                               dimensions>>) {
+      logical_group = g;
+      type = group_type::logical_group;
+    }
+  }
+};
+} // namespace experimental
+
+/// If x <= 2, then return a pointer to the deafult queue;
+/// otherwise, return x reinterpreted as a dpct::queue_ptr.
+inline queue_ptr int_as_queue_ptr(uintptr_t x) {
+  return x <= 2 ?
+  &get_default_queue()
+  : reinterpret_cast<queue_ptr>(x);
+}
+
+template <int n_nondefault_params, int n_default_params, typename T>
+class args_selector;
+
+/// args_selector is a helper class for extracting arguments from an
+/// array of pointers to arguments or buffer of arguments to pass to a
+/// kernel function.
+///
+/// \param R(Ts...) The type of the kernel
+/// \param n_nondefault_params The number of nondefault parameters of the kernel
+/// (excluding parameters that like sycl::nd_item, etc.)
+/// \param n_default_params The number of default parameters of the kernel
+///
+/// Example usage:
+/// With the following kernel:
+///   void foo(sycl::float2 *x, int n, sycl::nd_item<3> item_ct1, float f=.1) {}
+/// and with the declaration:
+///   args_selector<2, 1, decltype(foo)> selector(kernelParams, extra);
+/// we have:
+///   selector.get<0>() returns a reference to sycl::float*,
+///   selector.get<1>() returns a reference to int,
+///   selector.get<2>() returns a reference to float
+template <int n_nondefault_params, int n_default_params,
+   typename R, typename... Ts>
+class args_selector<n_nondefault_params, n_default_params, R(Ts...)> {
+private:
+  void **kernel_params;
+  char *args_buffer;
+
+  template <int i>
+  static constexpr int account_for_default_params() {
+    constexpr int n_total_params = sizeof...(Ts);
+    if constexpr (i >= n_nondefault_params) {
+      return n_total_params - n_default_params + (i - n_nondefault_params);
+    } else {
+      return i;
+    }
+  }    
+
+public:
+  /// Get the type of the ith argument of R(Ts...)
+  /// \param [in] i Index of parameter to get
+  /// \returns Type of ith parameter
+  template <int i>
+  using arg_type = std::tuple_element_t<account_for_default_params<i>(),
+					  std::tuple<Ts...>>;
+private:
+  template <int i>
+  static constexpr int get_offset() {
+    if constexpr (i == 0) {
+      // we can assume args_buffer is properly aligned to the
+      // first argument
+      return 0;
+    } else {
+      constexpr int prev_off = get_offset<i-1>();
+      constexpr int prev_past_end = prev_off + sizeof(arg_type<i-1>);
+      using T = arg_type<i>;
+      // is the past-the-end of the i-1st element properly aligned
+      // with the ith element's alignment?
+      if constexpr (prev_past_end % alignof(T) == 0) {
+	return prev_past_end;
+      }
+      // otherwise bump prev_past_end to match alignment
+      else {
+	return prev_past_end + (alignof(T) - (prev_past_end % alignof(T)));
+      }
+    }
+  }
+
+  static char *get_args_buffer(void **extra) {
+    if (!extra)
+      return nullptr;
+    for (; (std::size_t) *extra != 0; ++extra) {
+      if ((std::size_t) *extra == 1) {
+	return static_cast<char*>(*(extra+1));
+      }
+    }
+    return nullptr;
+  }
+    
+public:
+  /// If kernel_params is nonnull, then args_selector will
+  /// extract arguments from kernel_params. Otherwise, it
+  /// will extract them from extra.
+  /// \param [in] kernel_params Array of pointers to arguments
+  /// a or null pointer.
+  /// \param [in] extra Array containing pointer to argument buffer.
+  args_selector(void **kernel_params, void **extra)
+    : kernel_params(kernel_params),
+      args_buffer(get_args_buffer(extra))
+  {}
+
+  /// Get a reference to the ith argument extracted from kernel_params
+  /// or extra.
+  /// \param [in] i Index of argument to get
+  /// \returns Reference to the ith argument
+  template <int i>    
+  arg_type<i> &get() {
+    if (kernel_params) {
+      return *static_cast<arg_type<i>*>(kernel_params[i]);
+    } else {
+      return *reinterpret_cast<arg_type<i>*>(args_buffer + get_offset<i>());
+    }
+  }
+};
+
+#ifdef _WIN32
+#define DPCT_EXPORT __declspec(dllexport)
+#else
+#define DPCT_EXPORT
+#endif
+
+} // namespace dpct
+
+#endif // __DPCT_UTIL_HPP__
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index e74902c98d5ce..5336ee3d27fdf 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -117,6 +117,11 @@
 
 #include <dpct/lib_common_utils.hpp>
 
+static int g_ggml_sycl_debug=0;
+
+//#define GGML_SYCL_DEBUG(...) (if(g_ggml_sycl_debug) printf(__VA_ARGS__))
+#define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
+
 #define MIN_CC_DP4A   510 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA      700
 #define CC_OFFSET_AMD 1000000
@@ -217,7 +222,7 @@ static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 #if DPCT_COMPAT_RT_VERSION >= 12000
     static const char *cublas_get_error_str(const int err) {
         /*
-        DPCT1009:57: SYCL uses exceptions to report errors and does not use the
+        DPCT1009:48: SYCL uses exceptions to report errors and does not use the
         error codes. The original code was commented out and a warning string
         was inserted. You need to rewrite this code.
         */
@@ -249,13 +254,13 @@ static void ggml_cuda_error(const char * stmt, const char * func, const char * f
 }
 
 /*
-DPCT1001:59: The statement could not be removed.
+DPCT1001:50: The statement could not be removed.
 */
 /*
-DPCT1000:60: Error handling if-stmt was detected but could not be rewritten.
+DPCT1000:51: Error handling if-stmt was detected but could not be rewritten.
 */
 /*
-DPCT1009:61: SYCL uses exceptions to report errors and does not use the error
+DPCT1009:52: SYCL uses exceptions to report errors and does not use the error
 codes. The original code was commented out and a warning string was inserted.
 You need to rewrite this code.
 */
@@ -273,16 +278,16 @@ You need to rewrite this code.
 static const char *cu_get_error_str(int err) {
     const char * err_str;
     /*
-    DPCT1007:58: Migration of cuGetErrorString is not supported.
+    DPCT1007:49: Migration of cuGetErrorString is not supported.
     */
-    cuGetErrorString(err, &err_str);
+    // cuGetErrorString(err, &err_str);
     return err_str;
 }
 /*
-DPCT1001:76: The statement could not be removed.
+DPCT1001:67: The statement could not be removed.
 */
 /*
-DPCT1000:77: Error handling if-stmt was detected but could not be rewritten.
+DPCT1000:68: Error handling if-stmt was detected but could not be rewritten.
 */
 #define CU_CHECK(err)                                                          \
     do { auto err_ = (err);                                                    \
@@ -581,7 +586,7 @@ inline dpct::err0 ggml_cuda_set_device(const int device) try {
     }
 
     /*
-    DPCT1093:62: The "device" device may be not the one intended for use. Adjust
+    DPCT1093:53: The "device" device may be not the one intended for use. Adjust
     the selected device if needed.
     */
     return DPCT_CHECK_ERROR(dpct::select_device(device));
@@ -615,7 +620,8 @@ static dpct::queue_ptr g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 static void bad_arch(const sycl::stream &stream_ct1) {
     stream_ct1 << "ERROR: ggml-cuda was compiled without support for the "
                   "current GPU architecture.\n";
-    __trap();
+    // __trap();
+    std::exit(1);
 
     (void) bad_arch; // suppress unused function warning
 }
@@ -625,8 +631,8 @@ static __dpct_inline__ float warp_reduce_sum(float x,
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         /*
-        DPCT1096:107: The right-most dimension of the work-group used in the
-        SYCL kernel that calls this function may be less than "32". The function
+        DPCT1096:98: The right-most dimension of the work-group used in the SYCL
+        kernel that calls this function may be less than "32". The function
         "dpct::permute_sub_group_by_xor" may return an unexpected result on the
         CPU device. Modify the size of the work-group to ensure that the value
         of the right-most dimension is a multiple of "32".
@@ -653,8 +659,8 @@ static __dpct_inline__ float warp_reduce_max(float x,
 #pragma unroll
     for (int mask = 16; mask > 0; mask >>= 1) {
         /*
-        DPCT1096:106: The right-most dimension of the work-group used in the
-        SYCL kernel that calls this function may be less than "32". The function
+        DPCT1096:97: The right-most dimension of the work-group used in the SYCL
+        kernel that calls this function may be less than "32". The function
         "dpct::permute_sub_group_by_xor" may return an unexpected result on the
         CPU device. Modify the size of the work-group to ensure that the value
         of the right-most dimension is a multiple of "32".
@@ -993,7 +999,7 @@ static void group_norm_f32(const float * x, float * dst, const int group_size, c
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -1024,7 +1030,7 @@ static void group_norm_f32(const float * x, float * dst, const int group_size, c
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:64: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -2867,11 +2873,11 @@ vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
 template <int mmq_y>
 static __dpct_inline__ void
 allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs, float *tile_x_d) {
+                    int *tile_x_qs_q4_0, float *tile_x_d_q4_0) {
     (void)x_qh; (void)x_sc;
 
-    *x_ql = tile_x_qs;
-    *x_dm = (sycl::half2 *)tile_x_d;
+    *x_ql = tile_x_qs_q4_0;
+    *x_dm = (sycl::half2 *)tile_x_d_q4_0;
 }
 
 template <int mmq_y, int nwarps, bool need_check>
@@ -2969,11 +2975,11 @@ vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
 template <int mmq_y>
 static __dpct_inline__ void
 allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs, sycl::half2 *tile_x_dm) {
+                    int *tile_x_qs_q4_1, sycl::half2 *tile_x_dm_q4_1) {
     (void)x_qh; (void)x_sc;
 
-    *x_ql = tile_x_qs;
-    *x_dm = tile_x_dm;
+    *x_ql = tile_x_qs_q4_1;
+    *x_dm = tile_x_dm_q4_1;
 }
 
 template <int mmq_y, int nwarps, bool need_check>
@@ -3070,11 +3076,11 @@ vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
 template <int mmq_y>
 static __dpct_inline__ void
 allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, float *tile_x_d) {
+                    int *tile_x_ql_q5_0, float *tile_x_d_q5_0) {
     (void)x_qh; (void)x_sc;
 
-    *x_ql = tile_x_ql;
-    *x_dm = (sycl::half2 *)tile_x_d;
+    *x_ql = tile_x_ql_q5_0;
+    *x_dm = (sycl::half2 *)tile_x_d_q5_0;
 }
 
 template <int mmq_y, int nwarps, bool need_check>
@@ -3195,11 +3201,11 @@ vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
 template <int mmq_y>
 static __dpct_inline__ void
 allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm) {
+                    int *tile_x_ql_q5_1, sycl::half2 *tile_x_dm_q5_1) {
     (void)x_qh; (void)x_sc;
 
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
+    *x_ql = tile_x_ql_q5_1;
+    *x_dm = tile_x_dm_q5_1;
 }
 
 template <int mmq_y, int nwarps, bool need_check>
@@ -3311,11 +3317,11 @@ vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
 template <int mmq_y>
 static __dpct_inline__ void
 allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs, float *tile_x_d) {
+                    int *tile_x_qs_q8_0, float *tile_x_d_q8_0) {
     (void)x_qh; (void)x_sc;
 
-    *x_ql = tile_x_qs;
-    *x_dm = (sycl::half2 *)tile_x_d;
+    *x_ql = tile_x_qs_q8_0;
+    *x_dm = (sycl::half2 *)tile_x_d_q8_0;
 }
 
 template <int mmq_y, int nwarps, bool need_check>
@@ -3409,12 +3415,13 @@ vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
 template <int mmq_y>
 static __dpct_inline__ void
 allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+                    int *tile_x_ql_q2_K, sycl::half2 *tile_x_dm_q2_K,
+                    int *tile_x_sc_q2_K) {
     (void)x_qh;
 
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
+    *x_ql = tile_x_ql_q2_K;
+    *x_dm = tile_x_dm_q2_K;
+    *x_sc = tile_x_sc_q2_K;
 }
 
 template <int mmq_y, int nwarps, bool need_check>
@@ -3536,13 +3543,13 @@ vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
 template <int mmq_y>
 static __dpct_inline__ void
 allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh,
-                    int *tile_x_sc) {
+                    int *tile_x_ql_q3_K, sycl::half2 *tile_x_dm_q3_K,
+                    int *tile_x_qh_q3_K, int *tile_x_sc_q3_K) {
 
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_qh = tile_x_qh;
-    *x_sc = tile_x_sc;
+    *x_ql = tile_x_ql_q3_K;
+    *x_dm = tile_x_dm_q3_K;
+    *x_qh = tile_x_qh_q3_K;
+    *x_sc = tile_x_sc_q3_K;
 }
 
 template <int mmq_y, int nwarps, bool need_check>
@@ -3761,12 +3768,13 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
 template <int mmq_y>
 static __dpct_inline__ void
 allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+                    int *tile_x_ql_q4_K, sycl::half2 *tile_x_dm_q4_K,
+                    int *tile_x_sc_q4_K) {
     (void)x_qh;
 
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
+    *x_ql = tile_x_ql_q4_K;
+    *x_dm = tile_x_dm_q4_K;
+    *x_sc = tile_x_sc_q4_K;
 }
 
 template <int mmq_y, int nwarps, bool need_check>
@@ -3949,12 +3957,13 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
 template <int mmq_y>
 static __dpct_inline__ void
 allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+                    int *tile_x_ql_q5_K, sycl::half2 *tile_x_dm_q5_K,
+                    int *tile_x_sc_q5_K) {
     (void)x_qh;
 
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
+    *x_ql = tile_x_ql_q5_K;
+    *x_dm = tile_x_dm_q5_K;
+    *x_sc = tile_x_sc_q5_K;
 }
 
 template <int mmq_y, int nwarps, bool need_check>
@@ -4191,8 +4200,7 @@ static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat(
 }
 
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
-          int mmq_y, int nwarps, allocate_tiles_cuda_t allocate_tiles,
-          load_tiles_cuda_t load_tiles, int vdr,
+          int mmq_y, int nwarps, load_tiles_cuda_t load_tiles, int vdr,
           vec_dot_q_mul_mat_cuda_t vec_dot>
 /*
 DPCT1110:8: The total declared local variable size in device function mul_mat_q
@@ -4204,8 +4212,8 @@ static __dpct_inline__ void
 mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
           float *__restrict__ dst, const int ncols_x, const int nrows_x,
           const int ncols_y, const int nrows_y, const int nrows_dst,
-          const sycl::nd_item<3> &item_ct1, int *tile_x_ql,
-          sycl::half2 *tile_x_dm, int *tile_x_sc, int *tile_y_qs,
+          int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh,
+          int *tile_x_sc, const sycl::nd_item<3> &item_ct1, int *tile_y_qs,
           sycl::half2 *tile_y_ds) {
 
     const block_q_t  * x = (const block_q_t  *) vx;
@@ -4223,59 +4231,6 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
     const int col_dst_0 = item_ct1.get_group(1) * mmq_x;
     const int & col_y_0 = col_dst_0;
 
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-    /*
-    DPCT1084:11: The function call "allocate_tiles_q4_0" has multiple migration
-    results in different template instantiations that could not be unified. You
-    may need to adjust the code.
-    */
-    /*
-    DPCT1084:12: The function call "allocate_tiles_q4_1" has multiple migration
-    results in different template instantiations that could not be unified. You
-    may need to adjust the code.
-    */
-    /*
-    DPCT1084:13: The function call "allocate_tiles_q5_0" has multiple migration
-    results in different template instantiations that could not be unified. You
-    may need to adjust the code.
-    */
-    /*
-    DPCT1084:14: The function call "allocate_tiles_q5_1" has multiple migration
-    results in different template instantiations that could not be unified. You
-    may need to adjust the code.
-    */
-    /*
-    DPCT1084:15: The function call "allocate_tiles_q8_0" has multiple migration
-    results in different template instantiations that could not be unified. You
-    may need to adjust the code.
-    */
-    /*
-    DPCT1084:16: The function call "allocate_tiles_q2_K" has multiple migration
-    results in different template instantiations that could not be unified. You
-    may need to adjust the code.
-    */
-    /*
-    DPCT1084:17: The function call "allocate_tiles_q3_K" has multiple migration
-    results in different template instantiations that could not be unified. You
-    may need to adjust the code.
-    */
-    /*
-    DPCT1084:18: The function call "allocate_tiles_q4_K" has multiple migration
-    results in different template instantiations that could not be unified. You
-    may need to adjust the code.
-    */
-    /*
-    DPCT1084:19: The function call "allocate_tiles_q5_K" has multiple migration
-    results in different template instantiations that could not be unified. You
-    may need to adjust the code.
-    */
-    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc, tile_x_ql,
-                   tile_x_dm, tile_x_sc);
-
     float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
 
     for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
@@ -4333,7 +4288,7 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
             in converged control flow. You may need to adjust the code.
             */
             /*
-            DPCT1065:65: Consider replacing sycl::nd_item::barrier() with
+            DPCT1065:56: Consider replacing sycl::nd_item::barrier() with
             sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
             better performance if there is no access to global memory.
             */
@@ -4358,7 +4313,7 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
             in converged control flow. You may need to adjust the code.
             */
             /*
-            DPCT1065:66: Consider replacing sycl::nd_item::barrier() with
+            DPCT1065:57: Consider replacing sycl::nd_item::barrier() with
             sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
             better performance if there is no access to global memory.
             */
@@ -4415,8 +4370,12 @@ template <bool need_check> static void
     mul_mat_q4_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
-    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4429,30 +4388,33 @@ template <bool need_check> static void
     const int nwarps = NWARPS_Q4_0_RDNA1;
 #endif // defined(RDNA3) || defined(RDNA2)
 
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
         load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 
 #elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
     const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
     const int nwarps = NWARPS_Q4_0_AMPERE;
-
+    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_qs_q4_0, tile_x_d_q4_0);
     mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
-              allocate_tiles_q4_0<mmq_y>,
               load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ,
               vec_dot_q4_0_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
-        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
     const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
     const int nwarps = NWARPS_Q4_0_PASCAL;
 
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
         load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 #else
     (void) vec_dot_q4_0_q8_1_mul_mat;
     bad_arch();
@@ -4489,8 +4451,12 @@ template <bool need_check> static void
     mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
-    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1,
+    sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4503,30 +4469,31 @@ template <bool need_check> static void
     const int nwarps = NWARPS_Q4_1_RDNA1;
 #endif // defined(RDNA3) || defined(RDNA2)
 
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
         load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 
 #elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
     const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
     const int nwarps = NWARPS_Q4_1_AMPERE;
-
+    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_qs_q4_1, tile_x_dm_q4_1);
     mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
-              allocate_tiles_q4_1<mmq_y>,
               load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ,
               vec_dot_q4_1_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
-        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
     const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
     const int nwarps = NWARPS_Q4_1_PASCAL;
-
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
         load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 #else
     (void) vec_dot_q4_1_q8_1_mul_mat;
     bad_arch();
@@ -4561,8 +4528,12 @@ template <bool need_check> static void
     mul_mat_q5_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
-    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4574,31 +4545,31 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
     const int nwarps = NWARPS_Q5_0_RDNA1;
 #endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
         load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 
 #elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
     const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
     const int nwarps = NWARPS_Q5_0_AMPERE;
-
+    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q5_0, tile_x_d_q5_0);
     mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
-              allocate_tiles_q5_0<mmq_y>,
               load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ,
               vec_dot_q5_0_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
-        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
     const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
     const int nwarps = NWARPS_Q5_0_PASCAL;
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
         load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 #else
     (void) vec_dot_q5_0_q8_1_mul_mat;
     bad_arch();
@@ -4633,8 +4604,12 @@ template <bool need_check> static void
 mul_mat_q5_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
-    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1,
+    sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4646,31 +4621,31 @@ mul_mat_q5_1(
     const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
     const int nwarps = NWARPS_Q5_1_RDNA1;
 #endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
         load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 
 #elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
     const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
     const int nwarps = NWARPS_Q5_1_AMPERE;
-
+    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q5_1, tile_x_dm_q5_1);
     mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
-              allocate_tiles_q5_1<mmq_y>,
               load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ,
               vec_dot_q5_1_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
-        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
     const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
     const int nwarps = NWARPS_Q5_1_PASCAL;
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
         load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 #else
     (void) vec_dot_q5_1_q8_1_mul_mat;
     bad_arch();
@@ -4705,8 +4680,12 @@ template <bool need_check> static void
     mul_mat_q8_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
-    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4718,31 +4697,31 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
     const int nwarps = NWARPS_Q8_0_RDNA1;
 #endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
         load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 
 #elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
     const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
     const int nwarps = NWARPS_Q8_0_AMPERE;
-
+    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_qs_q8_0, tile_x_d_q8_0);
     mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
-              allocate_tiles_q8_0<mmq_y>,
               load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ,
               vec_dot_q8_0_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
-        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
     const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
     const int nwarps = NWARPS_Q8_0_PASCAL;
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
         load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 #else
     (void) vec_dot_q8_0_q8_1_mul_mat;
     bad_arch();
@@ -4777,8 +4756,13 @@ template <bool need_check> static void
 mul_mat_q2_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
-    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K,
+    sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs,
+    sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4790,31 +4774,31 @@ mul_mat_q2_K(
     const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
     const int nwarps = NWARPS_Q2_K_RDNA1;
 #endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
         load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 
 #elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
     const int nwarps = NWARPS_Q2_K_AMPERE;
-
+    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K);
     mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
-              allocate_tiles_q2_K<mmq_y>,
               load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ,
               vec_dot_q2_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
-        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
     const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
     const int nwarps = NWARPS_Q2_K_PASCAL;
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
         load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 #else
     (void) vec_dot_q2_K_q8_1_mul_mat;
     bad_arch();
@@ -4851,8 +4835,13 @@ template <bool need_check> static void
     mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
-    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K,
+    sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, int *tile_x_sc_q3_K,
+    int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4864,31 +4853,32 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
     const int nwarps = NWARPS_Q3_K_RDNA1;
 #endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
         load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 
 #elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
     const int nwarps = NWARPS_Q3_K_AMPERE;
-
+    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K,
+                               tile_x_sc_q3_K);
     mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
-              allocate_tiles_q3_K<mmq_y>,
               load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ,
               vec_dot_q3_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
-        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
     const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
     const int nwarps = NWARPS_Q3_K_PASCAL;
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
         load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 #else
     (void) vec_dot_q3_K_q8_1_mul_mat;
     bad_arch();
@@ -4925,8 +4915,13 @@ template <bool need_check> static void
     mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
-    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K,
+    sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs,
+    sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -4938,31 +4933,31 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
     const int nwarps = NWARPS_Q4_K_RDNA1;
 #endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
         load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 
 #elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
     const int nwarps = NWARPS_Q4_K_AMPERE;
-
+    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K);
     mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
-              allocate_tiles_q4_K<mmq_y>,
               load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ,
               vec_dot_q4_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
-        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
     const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
     const int nwarps = NWARPS_Q4_K_PASCAL;
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
         load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 #else
     (void) vec_dot_q4_K_q8_1_mul_mat;
     bad_arch();
@@ -4997,8 +4992,13 @@ template <bool need_check> static void
 mul_mat_q5_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
-    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K,
+    sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs,
+    sycl::half2 *tile_y_ds) {
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -5010,31 +5010,31 @@ mul_mat_q5_K(
     const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
     const int nwarps = NWARPS_Q5_K_RDNA1;
 #endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
         load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 
 #elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
     const int nwarps = NWARPS_Q5_K_AMPERE;
-
+    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K);
     mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
-              allocate_tiles_q5_K<mmq_y>,
               load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ,
               vec_dot_q5_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
-        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
     const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
     const int nwarps = NWARPS_Q5_K_PASCAL;
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
         load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 #else
     (void) vec_dot_q5_K_q8_1_mul_mat;
     bad_arch();
@@ -5073,6 +5073,10 @@ template <bool need_check> static void
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
     const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
     int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
+    // int   * tile_x_ql = nullptr;
+    // sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    // int   * tile_x_sc = nullptr;
 
 #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 #if defined(RDNA3) || defined(RDNA2)
@@ -5084,31 +5088,31 @@ template <bool need_check> static void
     const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
     const int nwarps = NWARPS_Q6_K_RDNA1;
 #endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
         load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 
 #elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
     const int nwarps = NWARPS_Q6_K_AMPERE;
-
+    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
+                               tile_x_ql, tile_x_dm, tile_x_sc);
     mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
-              allocate_tiles_q6_K<mmq_y>,
               load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ,
               vec_dot_q6_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1,
-        tile_x_ql, tile_x_dm, tile_x_sc, tile_y_qs, tile_y_ds);
+        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
+        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
 
 #elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
     const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
     const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
     const int nwarps = NWARPS_Q6_K_PASCAL;
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
         load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
 #else
     (void) vec_dot_q6_K_q8_1_mul_mat;
     bad_arch();
@@ -5721,7 +5725,7 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
         dst_row[col] = col;
     }
     /*
-    DPCT1065:67: Consider replacing sycl::nd_item::barrier() with
+    DPCT1065:58: Consider replacing sycl::nd_item::barrier() with
     sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
     performance if there is no access to global memory.
     */
@@ -5742,11 +5746,11 @@ static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
                 }
             }
             /*
-            DPCT1118:20: SYCL group functions and algorithms must be encountered
+            DPCT1118:11: SYCL group functions and algorithms must be encountered
             in converged control flow. You may need to adjust the code.
             */
             /*
-            DPCT1065:68: Consider replacing sycl::nd_item::barrier() with
+            DPCT1065:59: Consider replacing sycl::nd_item::barrier() with
             sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
             better performance if there is no access to global memory.
             */
@@ -5798,11 +5802,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
             buf[lane_id] = -INFINITY;
         }
         /*
-        DPCT1118:21: SYCL group functions and algorithms must be encountered in
+        DPCT1118:12: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:69: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -5812,11 +5816,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
             buf[warp_id] = max_val;
         }
         /*
-        DPCT1118:22: SYCL group functions and algorithms must be encountered in
+        DPCT1118:13: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:70: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -5844,11 +5848,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
             buf[lane_id] = 0.f;
         }
         /*
-        DPCT1118:23: SYCL group functions and algorithms must be encountered in
+        DPCT1118:14: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:71: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -5858,11 +5862,11 @@ static void soft_max_f32(const float * x, const float * y, float * dst, const in
             buf[warp_id] = tmp;
         }
         /*
-        DPCT1118:24: SYCL group functions and algorithms must be encountered in
+        DPCT1118:15: SYCL group functions and algorithms must be encountered in
         converged control flow. You may need to adjust the code.
         */
         /*
-        DPCT1065:72: Consider replacing sycl::nd_item::barrier() with
+        DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
         sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
         better performance if there is no access to global memory.
         */
@@ -6132,7 +6136,7 @@ struct bin_bcast_cuda {
                 }
             } else {
                 /*
-                DPCT1049:25: The work-group size passed to the SYCL kernel may
+                DPCT1049:16: The work-group size passed to the SYCL kernel may
                 exceed the limit. To get the device limit, query
                 info::device::max_work_group_size. Adjust the work-group size if
                 needed.
@@ -6275,7 +6279,7 @@ static void norm_f32_cuda(const float *x, float *dst, const int ncols,
     } else {
         const sycl::range<3> block_dims(1, 1, 1024);
         /*
-        DPCT1049:26: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -6320,7 +6324,7 @@ static void group_norm_f32_cuda(const float *x, float *dst,
     } else {
         const sycl::range<3> block_dims(1, 1, 1024);
         /*
-        DPCT1049:27: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -6407,7 +6411,7 @@ static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols,
     } else {
         const sycl::range<3> block_dims(1, 1, 1024);
         /*
-        DPCT1049:28: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7038,7 +7042,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:29: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:20: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7047,22 +7051,10 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0),
                     cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
@@ -7075,9 +7067,8 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q4_0<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_qs_q4_0_acc_ct1.get_pointer(),
+                            tile_x_d_q4_0_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7086,7 +7077,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:30: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:21: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7095,22 +7086,10 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0),
                     cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
@@ -7123,9 +7102,8 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q4_0<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_qs_q4_0_acc_ct1.get_pointer(),
+                            tile_x_d_q4_0_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7179,33 +7157,19 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:31: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:22: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
-        //zjy const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
-
         {
             dpct::has_capability_or_fail(stream->get_device(),
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(mmq_y /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   mmq_y /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1),
                     cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
@@ -7218,9 +7182,8 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q4_1<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_qs_q4_1_acc_ct1.get_pointer(),
+                            tile_x_dm_q4_1_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7229,7 +7192,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:23: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7238,22 +7201,10 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1),
                     cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
@@ -7266,9 +7217,8 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q4_1<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_qs_q4_1_acc_ct1.get_pointer(),
+                            tile_x_dm_q4_1_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7322,7 +7272,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:24: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7331,22 +7281,10 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0),
                     cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
@@ -7359,9 +7297,8 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q5_0<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q5_0_acc_ct1.get_pointer(),
+                            tile_x_d_q5_0_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7370,7 +7307,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:25: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7379,22 +7316,10 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0),
                     cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
@@ -7407,9 +7332,8 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q5_0<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q5_0_acc_ct1.get_pointer(),
+                            tile_x_d_q5_0_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7463,7 +7387,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:26: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7472,22 +7396,10 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1),
                     cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
@@ -7500,9 +7412,8 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q5_1<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q5_1_acc_ct1.get_pointer(),
+                            tile_x_dm_q5_1_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7511,7 +7422,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:27: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7520,22 +7431,10 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1),
                     cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
@@ -7548,9 +7447,8 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q5_1<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q5_1_acc_ct1.get_pointer(),
+                            tile_x_dm_q5_1_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7604,7 +7502,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:28: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7613,22 +7511,10 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0),
                     cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
@@ -7641,9 +7527,8 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q8_0<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_qs_q8_0_acc_ct1.get_pointer(),
+                            tile_x_d_q8_0_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7652,7 +7537,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:29: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7661,22 +7546,10 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0),
                     cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
@@ -7689,9 +7562,8 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q8_0<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_qs_q8_0_acc_ct1.get_pointer(),
+                            tile_x_d_q8_0_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7745,7 +7617,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:30: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7754,23 +7626,13 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K),
                     cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
@@ -7782,9 +7644,9 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q2_K<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q2_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q2_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q2_K_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7793,7 +7655,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:31: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7802,23 +7664,13 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K),
                     cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
@@ -7830,9 +7682,9 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q2_K<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q2_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q2_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q2_K_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7888,7 +7740,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7897,23 +7749,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K),
                     cgh);
+                sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
@@ -7925,9 +7769,10 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q3_K<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q3_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q3_K_acc_ct1.get_pointer(),
+                            tile_x_qh_q3_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q3_K_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -7936,7 +7781,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -7945,23 +7790,15 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K),
                     cgh);
+                sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
@@ -7973,9 +7810,10 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q3_K<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q3_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q3_K_acc_ct1.get_pointer(),
+                            tile_x_qh_q3_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q3_K_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -8030,7 +7868,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8039,23 +7877,13 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K),
                     cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
@@ -8067,9 +7895,9 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q4_K<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q4_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q4_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q4_K_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -8078,7 +7906,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8087,23 +7915,13 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K),
                     cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
@@ -8115,9 +7933,9 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q4_K<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q4_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q4_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q4_K_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -8171,7 +7989,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8180,23 +7998,13 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K),
                     cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
@@ -8208,9 +8016,9 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q5_K<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q5_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q5_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q5_K_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -8219,7 +8027,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:46: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8228,23 +8036,13 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
                                          {sycl::aspect::fp16});
 
             stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
+                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
+                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K),
                     cgh);
+                sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
@@ -8256,9 +8054,9 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
                         mul_mat_q5_K<need_check>(
                             vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
                             nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
+                            tile_x_ql_q5_K_acc_ct1.get_pointer(),
+                            tile_x_dm_q5_K_acc_ct1.get_pointer(),
+                            tile_x_sc_q5_K_acc_ct1.get_pointer(),
                             tile_y_qs_acc_ct1.get_pointer(),
                             tile_y_ds_acc_ct1.get_pointer());
                     });
@@ -8312,7 +8110,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
     if (nrows_x % mmq_y == 0) {
         const bool need_check = false;
         /*
-        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8322,22 +8120,12 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
 
             stream->submit([&](sycl::handler &cgh) {
                 sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K),
                     cgh);
                 sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
-                    cgh);
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
@@ -8360,7 +8148,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
     } else {
         const bool need_check = true;
         /*
-        DPCT1049:48: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8370,22 +8158,12 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
 
             stream->submit([&](sycl::handler &cgh) {
                 sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (2 * WARP_SIZE) +
-                                   dpct_placeholder /*Fix the type mannually*/),
-                    cgh);
+                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / QI6_K) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       QI6_K),
+                    sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K),
                     cgh);
                 sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(dpct_placeholder /*Fix the type mannually*/ *
-                                       (WARP_SIZE / 8) +
-                                   dpct_placeholder /*Fix the type mannually*/ /
-                                       8),
-                    cgh);
+                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
                 sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
                     sycl::range<1>(mmq_x * WARP_SIZE), cgh);
                 sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
@@ -8624,7 +8402,7 @@ static void rope_cuda(const T *x, T *dst, int ncols, int nrows,
     const sycl::range<3> block_nums(1, num_blocks_x, nrows);
     if (pos == nullptr) {
         /*
-        DPCT1049:49: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8640,7 +8418,7 @@ static void rope_cuda(const T *x, T *dst, int ncols, int nrows,
             });
     } else {
         /*
-        DPCT1049:50: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8673,7 +8451,7 @@ static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows,
 
     if (pos == nullptr) {
         /*
-        DPCT1049:51: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8690,7 +8468,7 @@ static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows,
             });
     } else {
         /*
-        DPCT1049:52: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8758,7 +8536,7 @@ static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols,
     const sycl::range<3> block_nums(1, nrows, 1);
     if (order == GGML_SORT_ASC) {
         /*
-        DPCT1049:53: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8769,7 +8547,7 @@ static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols,
             });
     } else if (order == GGML_SORT_DESC) {
         /*
-        DPCT1049:54: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -8807,13 +8585,13 @@ static void soft_max_f32_cuda(const float *x, const float *y, float *dst,
     const sycl::range<3> block_dims(1, 1, nth);
     const sycl::range<3> block_nums(1, 1, nrows_x);
     /*
-    DPCT1049:55: The work-group size passed to the SYCL kernel may exceed the
+    DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
     limit. To get the device limit, query info::device::max_work_group_size.
     Adjust the work-group size if needed.
     */
     stream->submit([&](sycl::handler &cgh) {
         /*
-        DPCT1101:105: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
+        DPCT1101:96: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
         replaced with a value. Modify the code to use the original expression,
         provided in comments, if it is correct.
         */
@@ -8970,102 +8748,17 @@ catch (sycl::exception const &exc) {
 #if !defined(GGML_USE_HIPBLAS)
 // pool with virtual memory
 /*
-DPCT1082:73: Migration of CUmemGenericAllocationHandle type is not supported.
+DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.
 */
-static std::vector<CUmemGenericAllocationHandle>
-    g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
+// static std::vector<CUmemGenericAllocationHandle>
+//     g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
 static dpct::device_ptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
 static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
 static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
 
 static void *ggml_cuda_pool_malloc_vmm(size_t size, size_t *actual_size) try {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-
-    // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
-    const size_t alignment = 128;
-    size = alignment * ((size + alignment - 1) / alignment);
 
-    size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
-
-    if (size > avail) {
-        // round up to the next multiple of the granularity
-        size_t reserve_size = size - avail;
-        const size_t granularity = g_device_caps[id].vmm_granularity;
-        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
-
-        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
-
-        // allocate more physical memory
-        /*
-        DPCT1082:74: Migration of CUmemAllocationProp type is not supported.
-        */
-        CUmemAllocationProp prop = {};
-        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        prop.location.id = id;
-        /*
-        DPCT1082:75: Migration of CUmemGenericAllocationHandle type is not
-        supported.
-        */
-        CUmemGenericAllocationHandle handle;
-        /*
-        DPCT1007:78: Migration of cuMemCreate is not supported.
-        */
-        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
-
-        // reserve virtual address space (if not already reserved)
-        if (g_cuda_pool_addr[id] == 0) {
-            /*
-            DPCT1007:79: Migration of cuMemAddressReserve is not supported.
-            */
-            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id],
-                                         CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
-        }
-
-        // map at the end of the pool
-        /*
-        DPCT1007:80: Migration of cuMemMap is not supported.
-        */
-        CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id],
-                          reserve_size, 0, handle, 0));
-
-        // set access
-        /*
-        DPCT1082:81: Migration of CUmemAccessDesc type is not supported.
-        */
-        CUmemAccessDesc access = {};
-        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        access.location.id = id;
-        access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        /*
-        DPCT1007:82: Migration of cuMemSetAccess is not supported.
-        */
-        CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id],
-                                reserve_size, &access, 1));
-
-        // add to the pool
-        g_cuda_pool_handles[id].push_back(handle);
-        g_cuda_pool_size[id] += reserve_size;
-
-        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
-        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
-        //       (unsigned long long) (reserve_size/1024/1024));
-    }
-
-    GGML_ASSERT(g_cuda_pool_addr[id] != 0);
-
-    void * ptr = (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]);
-    *actual_size = size;
-    g_cuda_pool_used[id] += size;
-
-#ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
-#endif
-
-    return ptr;
+    return NULL;
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -9168,9 +8861,8 @@ static bool g_cublas_loaded = false;
 bool ggml_cublas_loaded(void) {
     return g_cublas_loaded;
 }
-
 void print_devices(){
-    int device_count = dpct::dev_mgr::instance().device_count()
+    int device_count = dpct::dev_mgr::instance().device_count();
     for (int id = 0; id < device_count; ++id) {
         dpct::device_info prop;
         CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
@@ -9181,21 +8873,30 @@ void print_devices(){
     }
 }
 
+int get_sycl_env(const char* env_name, int default_val){
+    char * user_device_string = getenv(env_name);
+    printf("get_sycl_env=%s=%s\n", env_name, user_device_string);
+    int user_number = default_val;
+
+    unsigned n;
+    if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1) {
+            user_number = (int)n;
+        } else {
+            user_number=default_val;
+        }
+    return user_number;
+}
+
 void ggml_init_cublas() try {
     static bool initialized = false;
-
     if (!initialized) {
-        print_devices();
+        g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
 
-        char * user_device_string = getenv("GGML_SYCL_DEVICE");
-        int user_device_number = -1;
+        printf("g_ggml_sycl_debug=%d\n", g_ggml_sycl_debug);
 
-        unsigned n;
-        if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < g_device_count) {
-            user_device_number = (int)n;
-        } else {
-            user_device_number=0;
-        }
+        int user_device_number = get_sycl_env("GGML_SYCL_DEVICE", 0);
+
+        print_devices();
 
         if (DPCT_CHECK_ERROR(g_device_count =
                                  dpct::dev_mgr::instance().device_count()) !=
@@ -9250,6 +8951,8 @@ void ggml_init_cublas() try {
             g_device_caps[id].cc =
                 100 * prop.get_major_version() + 10 * prop.get_minor_version();
 #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+            // g_device_caps[id].cc = 9000;
+            printf("g_device_caps[%d].cc=%d\n", id, g_device_caps[id].cc);
         }
         for (int id = 0; id < g_device_count; ++id) {
             g_tensor_split[id] /= total_vram;
@@ -9293,6 +8996,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
+
 void ggml_cuda_set_tensor_split(const float * tensor_split) {
     if (tensor_split == nullptr) {
         return;
@@ -9326,22 +9030,22 @@ void *ggml_cuda_host_malloc(size_t size) try {
     dpct::err0 err = DPCT_CHECK_ERROR(
         ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
     /*
-    DPCT1000:91: Error handling if-stmt was detected but could not be rewritten.
+    DPCT1000:82: Error handling if-stmt was detected but could not be rewritten.
     */
     if (err != 0) {
         // clear the error
         /*
-        DPCT1026:92: The call to cudaGetLastError was removed because this
+        DPCT1026:83: The call to cudaGetLastError was removed because this
         functionality is redundant in SYCL.
         */
         /*
-        DPCT1001:90: The statement could not be removed.
+        DPCT1001:81: The statement could not be removed.
         */
         fprintf(
             stderr,
             "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
             /*
-            DPCT1009:93: SYCL uses exceptions to report errors and does not use
+            DPCT1009:84: SYCL uses exceptions to report errors and does not use
             the error codes. The original code was commented out and a warning
             string was inserted. You need to rewrite this code.
             */
@@ -9416,10 +9120,10 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
             dpct::err0 r = DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
                 rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream));
             /*
-            DPCT1001:94: The statement could not be removed.
+            DPCT1001:85: The statement could not be removed.
             */
             /*
-            DPCT1000:95: Error handling if-stmt was detected but could not be
+            DPCT1000:86: Error handling if-stmt was detected but could not be
             rewritten.
             */
             if (r != 0) return r;
@@ -10076,6 +9780,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
     if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+        GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp16 path\n");
         cuda_pool_alloc<sycl::half> src0_as_f16;
         if (src0->type != GGML_TYPE_F16) {
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
@@ -10117,6 +9822,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
     }
     else {
+        GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp32 path\n");
         cuda_pool_alloc<float> src0_ddq_as_f32;
 
         if (src0->type != GGML_TYPE_F32) {
@@ -10397,7 +10103,7 @@ inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
 
     scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     /*
-    DPCT1010:96: SYCL uses exceptions to report errors and does not use the
+    DPCT1010:87: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
     */
     CUDA_CHECK(0);
@@ -10422,7 +10128,7 @@ inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
 
     clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
     /*
-    DPCT1010:97: SYCL uses exceptions to report errors and does not use the
+    DPCT1010:88: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
     */
     CUDA_CHECK(0);
@@ -10487,7 +10193,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
     // do the computation
     op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
     /*
-    DPCT1010:98: SYCL uses exceptions to report errors and does not use the
+    DPCT1010:89: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
     */
     CUDA_CHECK(0);
@@ -10521,7 +10227,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
 #ifdef NDEBUG
     for (int id = 0; id < g_device_count; ++id) {
         CUDA_CHECK(ggml_cuda_set_device(id));
-        CUDA_CHECK(cudaDeviceSynchronize());
+        // CUDA_CHECK(cudaDeviceSynchronize());
     }
 
     for (int id = 0; id < g_device_count; ++id) {
@@ -10536,14 +10242,14 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
             }
 
             int can_access_peer;
-            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
-            if (can_access_peer) {
-                if (enable_peer_access) {
-                    CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
-                } else {
-                    CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
-                }
-            }
+            // CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            // if (can_access_peer) {
+            //     if (enable_peer_access) {
+            //         CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+            //     } else {
+            //         CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+            //     }
+            // }
         }
     }
 #endif // NDEBUG
@@ -10678,7 +10384,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
             if (src1_on_device && src1_is_contiguous) {
                 quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
                 /*
-                DPCT1010:99: SYCL uses exceptions to report errors and does not
+                DPCT1010:90: SYCL uses exceptions to report errors and does not
                 use the error codes. The call was replaced with 0. You need to
                 rewrite this code.
                 */
@@ -10699,7 +10405,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
     if (split && used_devices > 1) {
         CUDA_CHECK(ggml_cuda_set_device(g_main_device));
         /*
-        DPCT1024:100: The original code returned the error code that was further
+        DPCT1024:91: The original code returned the error code that was further
         consumed by the program logic. This original code was replaced with 0.
         You may need to rewrite the program logic consuming the error code.
         */
@@ -10776,7 +10482,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                 if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
                     quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                     /*
-                    DPCT1010:101: SYCL uses exceptions to report errors and does
+                    DPCT1010:92: SYCL uses exceptions to report errors and does
                     not use the error codes. The call was replaced with 0. You
                     need to rewrite this code.
                     */
@@ -10791,7 +10497,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                 op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
                    row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
                 /*
-                DPCT1010:102: SYCL uses exceptions to report errors and does not
+                DPCT1010:93: SYCL uses exceptions to report errors and does not
                 use the error codes. The call was replaced with 0. You need to
                 rewrite this code.
                 */
@@ -10836,7 +10542,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                 // add event for the main device to wait on until other device is done
                 if (split && (id != g_main_device || is != 0)) {
                     /*
-                    DPCT1024:103: The original code returned the error code that
+                    DPCT1024:94: The original code returned the error code that
                     was further consumed by the program logic. This original
                     code was replaced with 0. You may need to rewrite the
                     program logic consuming the error code.
@@ -11151,7 +10857,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
     cuda_pool_alloc<sycl::half> dst_f16;
     char * dst_t;
 
-    dpct::library_data_t cu_compute_type = CUBLAS_COMPUTE_16F;
+    dpct::library_data_t cu_compute_type = dpct::library_data_t::real_half;
     dpct::library_data_t cu_data_type = dpct::library_data_t::real_half;
 
     // dst strides
@@ -11175,7 +10881,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
     } else {
         dst_t = (char *) dst_ddf;
 
-        cu_compute_type = CUBLAS_COMPUTE_32F;
+        cu_compute_type = dpct::library_data_t::real_float;
         cu_data_type = dpct::library_data_t::real_float;
 
         alpha = &alpha_f32;
@@ -11230,7 +10936,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
 
         sycl::range<3> block_dims(1, ne12, ne13);
         /*
-        DPCT1049:56: The work-group size passed to the SYCL kernel may exceed
+        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
@@ -11255,7 +10961,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
             });
         }
         /*
-        DPCT1010:104: SYCL uses exceptions to report errors and does not use the
+        DPCT1010:95: SYCL uses exceptions to report errors and does not use the
         error codes. The call was replaced with 0. You need to rewrite this
         code.
         */
@@ -11334,8 +11040,10 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 
             if (use_mul_mat_vec_q) {
                 // NOTE: this kernel does not support ggml_nrows(src1) > 1
+                GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_vec_q path\n");
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
             } else {
+                GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_dequantize_mul_mat_vec path\n");
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
             }
         } else {
@@ -11346,10 +11054,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
             if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
                 use_mul_mat_q = false;
             }
-
+            // use_mul_mat_q = false;//zjy
             if (use_mul_mat_q) {
+                GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_q path\n");
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
             } else {
+                GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_cublas path\n");
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
             }
         }
diff --git a/ggml-sycl.cpp.base b/ggml-sycl.cpp.base
new file mode 100644
index 0000000000000..ab7af226f6891
--- /dev/null
+++ b/ggml-sycl.cpp.base
@@ -0,0 +1,11951 @@
+#define DPCT_COMPAT_RT_VERSION 12010
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <algorithm>
+#include <assert.h>
+#include <atomic>
+#include <cinttypes>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <float.h>
+#include <limits>
+#include <stdint.h>
+#include <stdio.h>
+#include <vector>
+#include <dpct/blas_utils.hpp>
+
+#if defined(GGML_USE_HIPBLAS)
+#include <hip/hip_runtime.h>
+#include <hipblas/hipblas.h>
+#include <hip/hip_fp16.h>
+#ifdef __HIP_PLATFORM_AMD__
+// for rocblas_initialize()
+#include "rocblas/rocblas.h"
+#endif // __HIP_PLATFORM_AMD__
+#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
+#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
+#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
+#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
+#define CUBLAS_OP_N HIPBLAS_OP_N
+#define CUBLAS_OP_T HIPBLAS_OP_T
+#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
+#define CUBLAS_TF32_TENSOR_OP_MATH 0
+#define CUDA_R_16F  HIPBLAS_R_16F
+#define CUDA_R_32F  HIPBLAS_R_32F
+#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
+#define cublasCreate hipblasCreate
+#define cublasGemmEx hipblasGemmEx
+#define cublasGemmBatchedEx hipblasGemmBatchedEx
+#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
+#define cublasHandle_t hipblasHandle_t
+#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
+#define cublasSetStream hipblasSetStream
+#define cublasSgemm hipblasSgemm
+#define cublasStatus_t hipblasStatus_t
+#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
+#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
+#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
+#define cudaDeviceProp hipDeviceProp_t
+#define cudaDeviceSynchronize hipDeviceSynchronize
+#define cudaError_t hipError_t
+#define cudaEventCreateWithFlags hipEventCreateWithFlags
+#define cudaEventDisableTiming hipEventDisableTiming
+#define cudaEventRecord hipEventRecord
+#define cudaEvent_t hipEvent_t
+#define cudaEventDestroy hipEventDestroy
+#define cudaFree hipFree
+#define cudaFreeHost hipHostFree
+#define cudaGetDevice hipGetDevice
+#define cudaGetDeviceCount hipGetDeviceCount
+#define cudaGetDeviceProperties hipGetDeviceProperties
+#define cudaGetErrorString hipGetErrorString
+#define cudaGetLastError hipGetLastError
+#define cudaMalloc hipMalloc
+#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMemcpy hipMemcpy
+#define cudaMemcpy2DAsync hipMemcpy2DAsync
+#define cudaMemcpyAsync hipMemcpyAsync
+#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
+#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
+#define cudaMemcpyKind hipMemcpyKind
+#define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
+#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
+#define cudaSetDevice hipSetDevice
+#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
+#define cudaStreamFireAndForget hipStreamFireAndForget
+#define cudaStreamNonBlocking hipStreamNonBlocking
+#define cudaStreamSynchronize hipStreamSynchronize
+#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
+#define cudaStream_t hipStream_t
+#define cudaSuccess hipSuccess
+#else
+#endif // defined(GGML_USE_HIPBLAS)
+
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
+#include <cmath>
+
+#include <dpct/lib_common_utils.hpp>
+
+#include <chrono>
+
+#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define CC_VOLTA      700
+#define CC_OFFSET_AMD 1000000
+#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
+
+#define GGML_CUDA_MAX_NODES 8192
+
+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// -  7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+//       probably other such cases, and not sure what happens on AMD hardware
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
+#if defined(GGML_USE_HIPBLAS)
+#define __CUDA_ARCH__ 1300
+
+#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
+    defined(__gfx1150__) || defined(__gfx1151__)
+#define RDNA3
+#endif
+
+#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
+    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
+#define RDNA2
+#endif
+
+#ifndef __has_builtin
+    #define __has_builtin(x) 0
+#endif
+
+typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
+static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+#if __has_builtin(__builtin_elementwise_sub_sat)
+    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
+    return reinterpret_cast<const int&>(c);
+#else
+    int8x4_t c;
+    int16_t tmp;
+#pragma unroll
+    for (int i = 0; i < 4; i++) {
+        tmp = va[i] - vb[i];
+        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
+        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
+        c[i] = tmp;
+    }
+    return reinterpret_cast<int&>(c);
+#endif // __has_builtin(__builtin_elementwise_sub_sat)
+}
+
+static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
+#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
+    c = __builtin_amdgcn_sdot4(a, b, c, false);
+#elif defined(__gfx1100__)
+    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
+#elif defined(__gfx1010__) || defined(__gfx900__)
+    int tmp1;
+    int tmp2;
+    asm("\n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
+        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
+        v_add3_u32 %0, %1, %2, %0 \n \
+        "
+        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
+        : "v"(a), "v"(b)
+    );
+#else
+    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
+    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
+    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
+#endif
+    return c;
+}
+#endif // defined(GGML_USE_HIPBLAS)
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+/*
+DPCT1001:63: The statement could not be removed.
+*/
+/*
+DPCT1000:64: Error handling if-stmt was detected but could not be rewritten.
+*/
+/*
+DPCT1009:65: SYCL uses exceptions to report errors and does not use the error
+codes. The original code was commented out and a warning string was inserted.
+You need to rewrite this code.
+*/
+#define CUDA_CHECK(err)                                                              \
+    do {                                                                             \
+        dpct::err0 err_ = (err);                                                     \
+        if (err_ != 0) {                                                             \
+            int id;                                                                  \
+            id = dpct::dev_mgr::instance().current_device_id();                      \
+            fprintf(                                                                 \
+                stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__,            \
+                __LINE__,                                                            \
+                "cudaGetErrorString is not supported" /*cudaGetErrorString(err_)*/); \
+            fprintf(stderr, "current device: %d\n", id);                             \
+            GGML_ASSERT(!"CUDA error");                                              \
+        }                                                                            \
+    } while (0)
+
+#if DPCT_COMPAT_RT_VERSION >= 12000
+/*
+DPCT1009:106: SYCL uses exceptions to report errors and does not use the error
+codes. The original code was commented out and a warning string was inserted.
+You need to rewrite this code.
+*/
+#define CUBLAS_CHECK(err)                                                                  \
+    do {                                                                                   \
+        int err_ = (err);                                                                  \
+        if (err_ != 0) {                                                                   \
+            int id;                                                                        \
+            id = dpct::dev_mgr::instance().current_device_id();                            \
+            fprintf(                                                                       \
+                stderr, "\ncuBLAS error %d at %s:%d: %s\n", err_, __FILE__,                \
+                __LINE__,                                                                  \
+                "cublasGetStatusString is not supported" /*cublasGetStatusString(err_)*/); \
+            fprintf(stderr, "current device: %d\n", id);                                   \
+            GGML_ASSERT(!"cuBLAS error");                                                  \
+        }                                                                                  \
+    } while (0)
+#else
+#define CUBLAS_CHECK(err)                                                               \
+    do {                                                                                \
+        cublasStatus_t err_ = (err);                                                    \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
+            int id;                                                                     \
+            cudaGetDevice(&id);                                                         \
+            fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);  \
+            fprintf(stderr, "current device: %d\n", id);                                \
+            GGML_ASSERT(!"cuBLAS error");                                               \
+        }                                                                               \
+    } while (0)
+#endif // CUDART_VERSION >= 11
+
+#if DPCT_COMPAT_RT_VERSION >= 11100
+#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#else
+#define GGML_CUDA_ASSUME(x)
+#endif // CUDART_VERSION >= 11100
+
+#ifdef GGML_CUDA_F16
+typedef half dfloat; // dequantize float
+typedef half2 dfloat2;
+#else
+typedef float dfloat; // dequantize float
+typedef sycl::float2 dfloat2;
+#endif //GGML_CUDA_F16
+
+static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) {
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] <<  0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __dpct_inline__ int get_int_from_uint8(const uint8_t *x8,
+                                              const int &i32) {
+    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+
+    int x32 = 0;
+    x32 |= x16[0] <<  0;
+    x32 |= x16[1] << 16;
+
+    return x32;
+}
+
+static __dpct_inline__ int get_int_from_int8_aligned(const int8_t *x8,
+                                                     const int &i32) {
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+static __dpct_inline__ int get_int_from_uint8_aligned(const uint8_t *x8,
+                                                      const int &i32) {
+    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
+}
+
+template <typename T>
+using to_t_cuda_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
+                             int k, dpct::queue_ptr stream);
+typedef to_t_cuda_t<float> to_fp32_cuda_t;
+typedef to_t_cuda_t<sycl::half> to_fp16_cuda_t;
+
+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
+typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
+typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
+typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+typedef void (*ggml_cuda_op_mul_mat_t)(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream);
+typedef void (*ggml_cuda_op_flatten_t)(const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst, const float *src0_dd,
+                                       const float *src1_dd, float *dst_dd,
+                                       const dpct::queue_ptr &main_stream);
+
+// QK = number of values after dequantization
+// QR = QK / number of values before dequantization
+// QI = number of 32 bit integers before dequantization
+
+#define QK4_0 32
+#define QR4_0 2
+#define QI4_0 (QK4_0 / (4 * QR4_0))
+typedef struct dpct_type_666762 {
+    sycl::half d;           // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+#define QK4_1 32
+#define QR4_1 2
+#define QI4_1 (QK4_1 / (4 * QR4_1))
+typedef struct dpct_type_237623 {
+    sycl::half2 dm;         // dm.x = delta, dm.y = min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+#define QK5_0 32
+#define QR5_0 2
+#define QI5_0 (QK5_0 / (4 * QR5_0))
+typedef struct dpct_type_447594 {
+    sycl::half d;           // delta
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_0 / 2];  // nibbles / quants
+} block_q5_0;
+static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+#define QK5_1 32
+#define QR5_1 2
+#define QI5_1 (QK5_1 / (4 * QR5_1))
+typedef struct dpct_type_150249 {
+    sycl::half2 dm;         // dm.x = delta, dm.y = min
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_1 / 2];  // nibbles / quants
+} block_q5_1;
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+#define QK8_0 32
+#define QR8_0 1
+#define QI8_0 (QK8_0 / (4 * QR8_0))
+typedef struct dpct_type_281565 {
+    sycl::half d;           // delta
+    int8_t  qs[QK8_0];      // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
+
+#define QK8_1 32
+#define QR8_1 1
+#define QI8_1 (QK8_1 / (4 * QR8_1))
+typedef struct dpct_type_126688 {
+    sycl::half2 ds;         // ds.x = delta, ds.y = sum
+    int8_t  qs[QK8_0];      // quants
+} block_q8_1;
+static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
+
+typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+typedef void (*allocate_tiles_cuda_t)(int **x_ql, sycl::half2 **x_dm,
+                                      int **x_qh, int **x_sc);
+typedef void (*load_tiles_cuda_t)(const void *__restrict__ vx,
+                                  int *__restrict__ x_ql,
+                                  sycl::half2 *__restrict__ x_dm,
+                                  int *__restrict__ x_qh,
+                                  int *__restrict__ x_sc, const int &i_offset,
+                                  const int &i_max, const int &k,
+                                  const int &blocks_per_row);
+typedef float (*vec_dot_q_mul_mat_cuda_t)(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
+    const int &i, const int &j, const int &k);
+
+//================================= k-quants
+
+#ifdef GGML_QKK_64
+#define QK_K 64
+#define K_SCALE_SIZE 4
+#else
+#define QK_K 256
+#define K_SCALE_SIZE 12
+#endif
+
+#define QR2_K 4
+#define QI2_K (QK_K / (4*QR2_K))
+typedef struct dpct_type_141842 {
+    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
+    uint8_t qs[QK_K/4];      // quants
+    sycl::half2 dm;          // super-block scale for quantized scales/mins
+} block_q2_K;
+static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
+
+#define QR3_K 4
+#define QI3_K (QK_K / (4*QR3_K))
+typedef struct dpct_type_116731 {
+    uint8_t hmask[QK_K/8];     // quants - high bit
+    uint8_t qs[QK_K/4];        // quants - low 2 bits
+#ifdef GGML_QKK_64
+    uint8_t scales[2]; // scales, quantized with 8 bits
+#else
+    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
+#endif
+    sycl::half d; // super-block scale
+} block_q3_K;
+//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
+
+#define QR4_K 2
+#define QI4_K (QK_K / (4*QR4_K))
+#ifdef GGML_QKK_64
+typedef struct {
+    half    dm[2];             // super-block scales/mins
+    uint8_t scales[2];         // 4-bit block scales/mins
+    uint8_t qs[QK_K/2];        // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
+#else
+typedef struct dpct_type_801816 {
+    sycl::half2 dm;            // super-block scale for quantized scales/mins
+    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
+    uint8_t qs[QK_K/2];        // 4--bit quants
+} block_q4_K;
+static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
+#endif
+
+#define QR5_K 2
+#define QI5_K (QK_K / (4*QR5_K))
+#ifdef GGML_QKK_64
+typedef struct {
+    half d;                  // super-block scale
+    int8_t scales[QK_K/16];  // block scales
+    uint8_t qh[QK_K/8];      // quants, high bit
+    uint8_t qs[QK_K/2];      // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
+#else
+typedef struct dpct_type_105170 {
+    sycl::half2 dm;               // super-block scale for quantized scales/mins
+    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
+    uint8_t qh[QK_K/8];           // quants, high bit
+    uint8_t qs[QK_K/2];           // quants, low 4 bits
+} block_q5_K;
+static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
+#endif
+
+#define QR6_K 2
+#define QI6_K (QK_K / (4*QR6_K))
+typedef struct dpct_type_341564 {
+    uint8_t ql[QK_K/2];   // quants, lower 4 bits
+    uint8_t qh[QK_K/4];   // quants, upper 2 bits
+    int8_t  scales[QK_K/16]; // scales
+    sycl::half d;            // delta
+} block_q6_K;
+static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
+
+#define WARP_SIZE 32
+#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
+
+#define CUDA_GELU_BLOCK_SIZE 256
+#define CUDA_SILU_BLOCK_SIZE 256
+#define CUDA_TANH_BLOCK_SIZE 256
+#define CUDA_RELU_BLOCK_SIZE 256
+#define CUDA_SQR_BLOCK_SIZE 256
+#define CUDA_CPY_BLOCK_SIZE 32
+#define CUDA_SCALE_BLOCK_SIZE 256
+#define CUDA_CLAMP_BLOCK_SIZE 256
+#define CUDA_ROPE_BLOCK_SIZE 256
+#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
+#define CUDA_ALIBI_BLOCK_SIZE 32
+#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
+#define CUDA_QUANTIZE_BLOCK_SIZE 256
+#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
+#define CUDA_UPSCALE_BLOCK_SIZE 256
+#define CUDA_CONCAT_BLOCK_SIZE 256
+#define CUDA_PAD_BLOCK_SIZE 256
+#define CUDA_ACC_BLOCK_SIZE 256
+#define CUDA_IM2COL_BLOCK_SIZE 256
+
+// dmmv = dequantize_mul_mat_vec
+#ifndef GGML_CUDA_DMMV_X
+#define GGML_CUDA_DMMV_X 32
+#endif
+#ifndef GGML_CUDA_MMV_Y
+#define GGML_CUDA_MMV_Y 1
+#endif
+
+#ifndef K_QUANTS_PER_ITERATION
+#define K_QUANTS_PER_ITERATION 2
+#else
+static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
+#endif
+
+#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
+#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+
+#define MUL_MAT_SRC1_COL_STRIDE 128
+
+#define MAX_STREAMS 8
+static dpct::queue_ptr g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = {
+    {&dpct::get_in_order_queue()}};
+
+struct ggml_tensor_extra_gpu {
+    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    dpct::event_ptr events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS];
+    std::chrono::time_point<std::chrono::steady_clock> events_ct1_0[16];
+    std::chrono::time_point<std::chrono::steady_clock>
+        events_ct1_is[16] ; // events for synchronizing multiple GPUs
+};
+
+// this is faster on Windows
+// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
+inline dpct::err0 ggml_cuda_set_device(const int device) try {
+    int current_device;
+    current_device = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(current_device = dpct::dev_mgr::instance().current_device_id());
+
+    if (device == current_device) {
+        return 0;
+    }
+
+    /*
+    DPCT1093:66: The "device" device may be not the one intended for use. Adjust
+    the selected device if needed.
+    */
+    return DPCT_CHECK_ERROR(dpct::select_device(device));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static int g_device_count = -1;
+static int g_main_device = 0;
+static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
+static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+
+static void * g_scratch_buffer = nullptr;
+static size_t g_scratch_size = 0; // disabled by default
+static size_t g_scratch_offset = 0;
+
+static dpct::queue_ptr g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+static __dpct_inline__ float warp_reduce_sum(float x,
+                                             const sycl::nd_item<3> &item_ct1) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:0: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        /*
+        DPCT1096:127: The right-most dimension of the work-group used in the
+        SYCL kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
+    }
+    return x;
+}
+
+static __dpct_inline__ sycl::float2
+warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:1: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
+                                                mask);
+        /*
+        DPCT1023:2: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
+                                                mask);
+    }
+    return a;
+}
+
+static __dpct_inline__ float warp_reduce_max(float x,
+                                             const sycl::nd_item<3> &item_ct1) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:3: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        /*
+        DPCT1096:126: The right-most dimension of the work-group used in the
+        SYCL kernel that calls this function may be less than "32". The function
+        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
+        CPU device. Modify the size of the work-group to ensure that the value
+        of the right-most dimension is a multiple of "32".
+        */
+        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
+                              item_ct1.get_sub_group(), x, mask));
+    }
+    return x;
+}
+
+static __dpct_inline__ float op_repeat(const float a, const float b) {
+    return b;
+}
+
+static __dpct_inline__ float op_add(const float a, const float b) {
+    return a + b;
+}
+
+static __dpct_inline__ float op_mul(const float a, const float b) {
+    return a * b;
+}
+
+static __dpct_inline__ float op_div(const float a, const float b) {
+    return a / b;
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1));
+    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) /
+                   ne3;
+    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                    item_ct1.get_local_id(0)) %
+                   ne3;
+
+    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    for (int i0 = i0s; i0 < ne0;
+         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
+        const int i10 = i0 % ne10;
+        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+    }
+}
+
+template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
+static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
+        int ne0, int ne1, int ne2, int ne3,
+        int ne10, int ne11, int ne12, int ne13,
+        /*int s0, */ int s1,  int s2,  int s3,
+        /*int s10,*/ int s11, int s12, int s13,
+        const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    const int i3 = i/(ne2*ne1*ne0);
+    const int i2 = (i/(ne1*ne0)) % ne2;
+    const int i1 = (i/ne0) % ne1;
+    const int i0 = i % ne0;
+
+    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
+        return;
+    }
+
+    const int i11 = i1 % ne11;
+    const int i12 = i2 % ne12;
+    const int i13 = i3 % ne13;
+
+    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
+    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
+    const size_t i_dst  = i_src0;
+
+    const src0_t * src0_row = src0 + i_src0;
+    const src1_t * src1_row = src1 + i_src1;
+    dst_t * dst_row = dst + i_dst;
+
+    const int i10 = i0 % ne10;
+    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
+}
+
+static void acc_f32(const float * x, const float * y, float * dst, const int ne,
+    const int ne10, const int ne11, const int ne12,
+    const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= ne) {
+        return;
+    }
+    int src1_idx = i - offset;
+    int oz = src1_idx / nb2;
+    int oy = (src1_idx - (oz * nb2)) / nb1;
+    int ox = src1_idx % nb1;
+    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
+        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
+    } else {
+        dst[i] = x[i];
+    }
+}
+
+static void gelu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const float GELU_COEF_A    = 0.044715f;
+    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    float xi = x[i];
+    dst[i] = 0.5f * xi *
+             (1.0f +
+              sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
+}
+
+static void silu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
+}
+
+static void gelu_quick_f32(const float *x, float *dst, int k,
+                           const sycl::nd_item<3> &item_ct1) {
+    const float GELU_QUICK_COEF = -1.702f;
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
+}
+
+static void tanh_f32(const float *x, float *dst, int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::tanh((float)(x[i]));
+}
+
+static void relu_f32(const float * x, float * dst, const int k,
+                     const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmax((float)(x[i]), (float)0);
+}
+
+static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+    if (i >= k) {
+        return;
+    }
+    dst[i] = sycl::fmax((float)(x[i]), (float)0) +
+             sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
+}
+
+static void sqr_f32(const float * x, float * dst, const int k,
+                    const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+    dst[i] = x[i] * x[i];
+}
+
+template <int block_size>
+static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
+                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+
+    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row*ncols + col];
+        mean_var.x() += xi;
+        mean_var.y() += xi * xi;
+    }
+
+    // sum up partial sums
+    mean_var = warp_reduce_sum(mean_var, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = mean_var;
+        }
+        /*
+        DPCT1118:4: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        mean_var = s_sum[lane_id];
+        mean_var = warp_reduce_sum(mean_var, item_ct1);
+    }
+
+    const float mean = mean_var.x() / ncols;
+    const float var = mean_var.y() / ncols - mean * mean;
+    const float inv_std = sycl::rsqrt(var + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
+    }
+}
+
+static void concat_f32(const float  *x,const float  *y, float *dst, const int ne0, const int ne02,
+                       const sycl::nd_item<3> &item_ct1) {
+    int nidx = item_ct1.get_local_id(2) +
+               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    if (item_ct1.get_group(0) < ne02) { // src0
+        int offset_src =
+            nidx + item_ct1.get_group(1) * ne0 +
+            item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+            dst[offset_dst] = x[offset_src];
+    } else {
+        int offset_src =
+            nidx + item_ct1.get_group(1) * ne0 +
+            (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
+            dst[offset_dst] = y[offset_src];
+    }
+}
+
+static void upscale_f32(const float  *x, float *dst, const int ne00, const int nb02, const int scale_factor,
+                        const sycl::nd_item<3> &item_ct1) {
+    int ne0 = ne00 * scale_factor;
+    int nidx = item_ct1.get_local_id(2) +
+               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (nidx >= ne0) {
+        return;
+    }
+    // operation
+    int i00 = nidx / scale_factor;
+    int i01 = item_ct1.get_group(1) / scale_factor;
+    int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
+    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    dst[offset_dst] = x[offset_src];
+}
+
+static void pad_f32(const float  *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
+                    const sycl::nd_item<3> &item_ct1) {
+    int nidx = item_ct1.get_local_id(2) +
+               item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (nidx >= ne0) {
+        return;
+    }
+
+    // operation
+    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
+                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
+    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
+        item_ct1.get_group(0) < ne02) {
+        int offset_src = nidx + item_ct1.get_group(1) * ne00 +
+                         item_ct1.get_group(0) * ne00 * ne01;
+            dst[offset_dst] = x[offset_src];
+    } else {
+        dst[offset_dst] = 0.0f;
+    }
+}
+
+template <int block_size>
+static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
+                           const sycl::nd_item<3> &item_ct1, float *s_sum) {
+    int start = item_ct1.get_group(2) * group_size;
+    int end = start + group_size;
+
+    start += item_ct1.get_local_id(2);
+
+    if (end >= ne_elements) {
+        end = ne_elements;
+    }
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int j = start; j < end; j += block_size) {
+        tmp += x[j];
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:5: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:67: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float mean = tmp / group_size;
+    tmp = 0.0f;
+
+    for (int j = start; j < end; j += block_size) {
+        float xi = x[j] - mean;
+        dst[j] = xi;
+        tmp += xi * xi;
+    }
+
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:6: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:68: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    float variance = tmp / group_size;
+    float scale = sycl::rsqrt(variance + eps);
+    for (int j = start; j < end; j += block_size) {
+        dst[j] *= scale;
+    }
+}
+
+template <int block_size>
+static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
+                         const sycl::nd_item<3> &item_ct1, float *s_sum) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    const int tid = item_ct1.get_local_id(2);
+
+    float tmp = 0.0f; // partial sum for thread in warp
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const float xi = x[row*ncols + col];
+        tmp += xi * xi;
+    }
+
+    // sum up partial sums
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+
+        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:7: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        item_ct1.barrier(sycl::access::fence_space::local_space);
+        tmp = s_sum[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    const float mean = tmp / ncols;
+    const float scale = sycl::rsqrt(mean + eps);
+
+    for (int col = tid; col < ncols; col += block_size) {
+        dst[row*ncols + col] = scale * x[row*ncols + col];
+    }
+}
+
+static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q4_0 * x = (const block_q4_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x() = vui & 0xF;
+    v.y() = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {8.0f, 8.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x() = (v.x() - 8.0f) * d;
+    v.y() = (v.y() - 8.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q4_1 * x = (const block_q4_1 *) vx;
+
+    const dfloat d = x[ib].dm[1];
+    const dfloat m = x[ib].dm[0];
+
+    const int vui = x[ib].qs[iqs];
+
+    v.x() = vui & 0xF;
+    v.y() = vui >> 4;
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x() = (v.x() * d) + m;
+    v.y() = (v.y() * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q5_0 * x = (const block_q5_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hsub2(v, {16.0f, 16.0f});
+    v = __hmul2(v, {d, d});
+#else
+    v.x() = (v.x() - 16.0f) * d;
+    v.y() = (v.y() - 16.0f) * d;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q5_1 * x = (const block_q5_1 *) vx;
+
+    const dfloat d = x[ib].dm[1];
+    const dfloat m = x[ib].dm[0];
+
+    uint32_t qh;
+    memcpy(&qh, x[ib].qh, sizeof(qh));
+
+    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
+    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
+
+    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
+    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+    v = __hadd2(v, {m, m});
+#else
+    v.x() = (v.x() * d) + m;
+    v.y() = (v.y() * d) + m;
+#endif // GGML_CUDA_F16
+}
+
+static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
+                                            const int iqs, dfloat2 &v) {
+    const block_q8_0 * x = (const block_q8_0 *) vx;
+
+    const dfloat d = x[ib].d;
+
+    v.x() = x[ib].qs[iqs + 0];
+    v.y() = x[ib].qs[iqs + 1];
+
+#ifdef GGML_CUDA_F16
+    v = __hmul2(v, {d, d});
+#else
+    v.x() *= d;
+    v.y() *= d;
+#endif // GGML_CUDA_F16
+}
+
+//================================== k-quants
+
+template<typename dst_t>
+static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+    const block_q2_K * x = (const block_q2_K *) vx;
+
+    const int tid = item_ct1.get_local_id(2);
+#if QK_K == 256
+    const int n   = tid/32;
+    const int l   = tid - 32*n;
+    const int is  = 8*n + l/16;
+
+    const uint8_t q = x[i].qs[32*n + l];
+    dst_t * y = yy + i*QK_K + 128*n;
+
+    float dall = x[i].dm[1];
+    float dmin = x[i].dm[0];
+    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
+    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
+    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
+#else
+    const int is = tid/16;  // 0 or 1
+    const int il = tid%16;  // 0...15
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    dst_t * y = yy + i*QK_K + 16*is + il;
+    float dall = __low2half(x[i].dm);
+    float dmin = __high2half(x[i].dm);
+    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
+    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
+#endif
+
+}
+
+template<typename dst_t>
+static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+
+    const int i = item_ct1.get_group(2);
+    const block_q3_K * x = (const block_q3_K *) vx;
+
+#if QK_K == 256
+    const int r = item_ct1.get_local_id(2) / 4;
+    const int tid = r/2;
+    const int is0 = r%2;
+    const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
+    const int n = tid / 4;
+    const int j = tid - 4*n;
+
+    uint8_t m = 1 << (4*n + j);
+    int is = 8*n + 2*j + is0;
+    int shift = 2*j;
+
+    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
+                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
+                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
+                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
+    float d_all = x[i].d;
+    float dl = d_all * (us - 32);
+
+    dst_t * y = yy + i*QK_K + 128*n + 32*j;
+    const uint8_t * q = x[i].qs + 32*n;
+    const uint8_t * hm = x[i].hmask;
+
+    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
+#else
+    const int tid = threadIdx.x;
+    const int is  = tid/16;  // 0 or 1
+    const int il  = tid%16;  // 0...15
+    const int im  = il/8;    // 0...1
+    const int in  = il%8;    // 0...7
+
+    dst_t * y = yy + i*QK_K + 16*is + il;
+
+    const uint8_t q = x[i].qs[il] >> (2*is);
+    const uint8_t h = x[i].hmask[in] >> (2*is + im);
+    const float   d = (float)x[i].d;
+
+    if (is == 0) {
+        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    } else {
+        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
+        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
+    }
+#endif
+
+}
+
+#if QK_K == 256
+static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
+    if (j < 4) {
+        d = q[j] & 63; m = q[j + 4] & 63;
+    } else {
+        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
+        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
+    }
+}
+#endif
+
+template<typename dst_t>
+static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q4_K * x = (const block_q4_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+
+#if QK_K == 256
+    // assume 32 threads
+    const int tid = item_ct1.get_local_id(2);
+    const int il  = tid/8;
+    const int ir  = tid%8;
+    const int is  = 2*il;
+    const int n   = 4;
+
+    dst_t * y = yy + i*QK_K + 64*il + n*ir;
+
+    const float dall = x[i].dm[1];
+    const float dmin = x[i].dm[0];
+
+    const uint8_t * q = x[i].qs + 32*il + n*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+    for (int l = 0; l < n; ++l) {
+        y[l + 0] = d1 * (q[l] & 0xF) - m1;
+        y[l +32] = d2 * (q[l] >>  4) - m2;
+    }
+#else
+    const int tid = threadIdx.x;
+    const uint8_t * q = x[i].qs;
+    dst_t * y = yy + i*QK_K;
+    const float d = (float)x[i].dm[0];
+    const float m = (float)x[i].dm[1];
+    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
+    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q5_K * x = (const block_q5_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+
+#if QK_K == 256
+    // assume 64 threads - this is very slightly better than the one below
+    const int tid = item_ct1.get_local_id(2);
+    const int il  = tid/16;   // il is in 0...3
+    const int ir  = tid%16;   // ir is in 0...15
+    const int is  = 2*il;     // is is in 0...6
+
+    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
+
+    const float dall = x[i].dm[1];
+    const float dmin = x[i].dm[0];
+
+    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
+    const uint8_t * qh = x[i].qh + 2*ir;
+
+    uint8_t sc, m;
+    get_scale_min_k4(is + 0, x[i].scales, sc, m);
+    const float d1 = dall * sc; const float m1 = dmin * m;
+    get_scale_min_k4(is + 1, x[i].scales, sc, m);
+    const float d2 = dall * sc; const float m2 = dmin * m;
+
+    uint8_t   hm  = 1 << (2*il);
+    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
+    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
+    hm <<= 1;
+    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
+    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
+#else
+    const int tid = threadIdx.x;
+    const uint8_t q = x[i].qs[tid];
+    const int im = tid/8;  // 0...3
+    const int in = tid%8;  // 0...7
+    const int is = tid/16; // 0 or 1
+    const uint8_t h = x[i].qh[in] >> im;
+    const float d = x[i].d;
+    dst_t * y = yy + i*QK_K + tid;
+    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
+    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
+#endif
+}
+
+template<typename dst_t>
+static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
+                                  const sycl::nd_item<3> &item_ct1) {
+    const block_q6_K * x = (const block_q6_K *) vx;
+
+    const int i = item_ct1.get_group(2);
+#if QK_K == 256
+
+    // assume 64 threads - this is very slightly better than the one below
+    const int tid = item_ct1.get_local_id(2);
+    const int ip  = tid/32;   // ip is 0 or 1
+    const int il  = tid - 32*ip; // 0...32
+    const int is  = 8*ip + il/16;
+
+    dst_t * y = yy + i*QK_K + 128*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t * ql = x[i].ql + 64*ip + il;
+    const uint8_t   qh = x[i].qh[32*ip + il];
+    const int8_t  * sc = x[i].scales + is;
+
+    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
+    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
+#else
+
+    // assume 32 threads
+    const int tid = threadIdx.x;
+    const int ip  = tid/16;         // 0 or 1
+    const int il  = tid - 16*ip;    // 0...15
+
+    dst_t * y = yy + i*QK_K + 16*ip + il;
+
+    const float d = x[i].d;
+
+    const uint8_t   ql = x[i].ql[16*ip + il];
+    const uint8_t   qh = x[i].qh[il] >> (2*ip);
+    const int8_t  * sc = x[i].scales;
+
+    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
+    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
+#endif
+}
+
+/*
+DPCT1110:8: The total declared local variable size in device function
+dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q2_K * x = (const block_q2_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
+    const int q_offset = 32*im + l0;
+    const int s_offset = 8*im;
+    const int y_offset = 128*im + l0;
+
+    uint32_t aux[4];
+    const uint8_t * d = (const uint8_t *)aux;
+    const uint8_t * m = (const uint8_t *)(aux + 2);
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+
+        const float dall = x[i].dm[1];
+        const float dmin = x[i].dm[0];
+
+        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
+        aux[0] = a[0] & 0x0f0f0f0f;
+        aux[1] = a[1] & 0x0f0f0f0f;
+        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
+        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
+                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
+                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
+                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
+                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
+                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
+                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
+                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
+            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
+                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
+
+        }
+        tmp += dall * sum1 - dmin * sum2;
+
+    }
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;
+
+    uint32_t uaux[2];
+    const uint8_t * d = (const uint8_t *)uaux;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint32_t * s = (const uint32_t *)x[i].scales;
+
+        uaux[0] = s[0] & 0x0f0f0f0f;
+        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
+
+        const float2 dall = __half22float2(x[i].dm);
+
+        float sum1 = 0, sum2 = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t ql = q[l];
+            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
+                  + y[l+16] * d[1] * ((ql >> 2) & 3)
+                  + y[l+32] * d[2] * ((ql >> 4) & 3)
+                  + y[l+48] * d[3] * ((ql >> 6) & 3);
+            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
+        }
+        tmp += dall.x * sum1 - dall.y * sum2;
+    }
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:9: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:10: The total declared local variable size in device function
+dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q3_K * x = (const block_q3_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+
+    const uint16_t kmask1 = 0x0303;
+    const uint16_t kmask2 = 0x0f0f;
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
+    const int step = 16/K_QUANTS_PER_ITERATION;
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0....15 or 0...7
+
+    const uint8_t m = 1 << (4*im);
+
+    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
+    const int q_offset =  32*im + l0;
+    const int y_offset = 128*im + l0;
+
+    uint16_t utmp[4];
+    const int8_t * s = (const int8_t *)utmp;
+
+    const uint16_t s_shift = 4*im;
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + y_offset;
+        const uint8_t * q = x[i].qs + q_offset;
+        const uint8_t * h = x[i].hmask + l0;
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
+        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
+        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
+
+        const float d = x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < n; ++l) {
+            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
+                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
+                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
+                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
+            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
+                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
+                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
+                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
+        }
+        tmp += d * sum;
+
+    }
+#else
+
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
+    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
+    const int in = offset/8;                                 // 0 or 1
+    const int im = offset%8;                                 // 0...7
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y = yy + i * QK_K + offset;
+        const uint8_t * q = x[i].qs + offset;
+        const uint8_t * s = x[i].scales;
+
+        const float dall = (float)x[i].d;
+
+        float sum = 0;
+        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
+            const uint8_t hl = x[i].hmask[im+l] >> in;
+            const uint8_t ql = q[l];
+            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
+                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
+                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
+                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
+        }
+        tmp += sum;
+    }
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:11: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:12: The total declared local variable size in device function
+dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q4_K * x = (const block_q4_K *)vx + ib0;
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
+
+    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
+
+    const int il  = tid/step;                            // 0...3
+    const int ir  = tid - step*il;                       // 0...7 or 0...3
+    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+#if K_QUANTS_PER_ITERATION == 2
+    uint32_t q32[4];
+    const uint8_t * q4 = (const uint8_t *)q32;
+#else
+    uint16_t q16[4];
+    const uint8_t * q4 = (const uint8_t *)q16;
+#endif
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y1 = yy + i*QK_K + y_offset;
+        const float   * y2 = y1 + 128;
+
+        const float dall = x[i].dm[1];
+        const float dmin = x[i].dm[0];
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+#if K_QUANTS_PER_ITERATION == 2
+        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
+        const uint32_t * q2 = q1 + 16;
+
+        q32[0] = q1[0] & 0x0f0f0f0f;
+        q32[1] = q1[0] & 0xf0f0f0f0;
+        q32[2] = q2[0] & 0x0f0f0f0f;
+        q32[3] = q2[0] & 0xf0f0f0f0;
+
+        sycl::float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 4; ++l) {
+            s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4];
+            s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f +
+                       s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) -
+               dmin * smin;
+#else
+        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
+        const uint16_t * q2 = q1 + 32;
+
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[0] & 0xf0f0;
+        q16[2] = q2[0] & 0x0f0f;
+        q16[3] = q2[0] & 0xf0f0;
+
+        float4 s = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        for (int l = 0; l < 2; ++l) {
+            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
+            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
+            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
+        }
+        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
+#endif
+
+    }
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    float tmp = 0;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t * q = x[i].qs + step;
+        const float   * y = yy + i*QK_K + step;
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux16[0] = a[0] & 0x0f0f;
+        aux16[1] = (a[0] >> 4) & 0x0f0f;
+        const float d = (float)x[i].dm[0];
+        const float m = (float)x[i].dm[1];
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
+                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
+                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
+                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
+        }
+        tmp += sum;
+    }
+
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:13: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+/*
+DPCT1110:14: The total declared local variable size in device function
+dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register
+pressure. Consult with your hardware vendor to find the total register size
+available and adjust the code, or use smaller sub-group size to avoid high
+register pressure.
+*/
+static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
+                                        const float *__restrict__ yy,
+                                        float *__restrict__ dst,
+                                        const int ncols,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    const int row = item_ct1.get_group(2);
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q5_K * x = (const block_q5_K *)vx + ib0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+#if QK_K == 256
+    const uint16_t kmask1 = 0x3f3f;
+    const uint16_t kmask2 = 0x0f0f;
+    const uint16_t kmask3 = 0xc0c0;
+
+    const int tid = item_ct1.get_local_id(2) / 2; // 0...15
+    const int ix = item_ct1.get_local_id(2) % 2;
+
+    const int il  = tid/4;     // 0...3
+    const int ir  = tid - 4*il;// 0...3
+    const int n   = 2;
+
+    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
+    const int in = il%2;
+
+    const int l0 = n*(2*ir + in);
+    const int q_offset = 32*im + l0;
+    const int y_offset = 64*im + l0;
+
+    const uint8_t hm1  = 1 << (2*im);
+    const uint8_t hm2  = hm1 << 4;
+
+    uint16_t aux[4];
+    const uint8_t * sc = (const uint8_t *)aux;
+
+    uint16_t q16[8];
+    const uint8_t * q4 = (const uint8_t *)q16;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2) {
+
+        const uint8_t * ql1 = x[i].qs + q_offset;
+        const uint8_t * qh  = x[i].qh + l0;
+        const float   * y1  = yy + i*QK_K + y_offset;
+        const float   * y2  = y1 + 128;
+
+        const float dall = x[i].dm[1];
+        const float dmin = x[i].dm[0];
+
+        const uint16_t * a = (const uint16_t *)x[i].scales;
+        aux[0] = a[im+0] & kmask1;
+        aux[1] = a[im+2] & kmask1;
+        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
+        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
+
+        sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
+        float smin = 0;
+        const uint16_t * q1 = (const uint16_t *)ql1;
+        const uint16_t * q2 = q1 + 32;
+        q16[0] = q1[0] & 0x0f0f;
+        q16[1] = q1[8] & 0x0f0f;
+        q16[2] = (q1[0] >> 4) & 0x0f0f;
+        q16[3] = (q1[8] >> 4) & 0x0f0f;
+        q16[4] = q2[0] & 0x0f0f;
+        q16[5] = q2[8] & 0x0f0f;
+        q16[6] = (q2[0] >> 4) & 0x0f0f;
+        q16[7] = (q2[8] >> 4) & 0x0f0f;
+        for (int l = 0; l < n; ++l) {
+            sum.x() +=
+                y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
+                y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
+            sum.y() +=
+                y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
+                y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
+            sum.z() +=
+                y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
+                y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
+            sum.w() +=
+                y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
+                y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
+            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
+                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
+        }
+        tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
+                       sum.w() * sc[5]) -
+               dmin * smin;
+    }
+
+#else
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
+    const int step = tid * K_QUANTS_PER_ITERATION;
+    const int im = step/8;
+    const int in = step%8;
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+        const uint8_t * q = x[i].qs + step;
+        const int8_t  * s = x[i].scales;
+        const float   * y = yy + i*QK_K + step;
+        const float     d = x[i].d;
+        float sum = 0.f;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            const uint8_t h = x[i].qh[in+j] >> im;
+            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
+                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
+                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
+                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
+        }
+        tmp += sum;
+    }
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:15: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
+                                        const sycl::nd_item<3> &item_ct1) {
+
+    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
+
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+    if (row > nrows) return;
+
+    const int num_blocks_per_row = ncols / QK_K;
+    const int ib0 = row*num_blocks_per_row;
+
+    const block_q6_K * x = (const block_q6_K *)vx + ib0;
+
+#if QK_K == 256
+
+    const int tid =
+        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
+    const int ix =
+        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1
+
+    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
+
+    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
+    const int in = tid - step*im;                        // 0...15 or 0...7
+
+#if K_QUANTS_PER_ITERATION == 1
+    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
+    const int is = 0;
+#else
+    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
+    const int is = in / 4;
+#endif
+    const int ql_offset = 64*im + l0;
+    const int qh_offset = 32*im + l0;
+    const int s_offset  =  8*im + is;
+    const int y_offset = 128*im + l0;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + y_offset;
+        const uint8_t * ql = x[i].ql + ql_offset;
+        const uint8_t * qh = x[i].qh + qh_offset;
+        const int8_t  * s  = x[i].scales + s_offset;
+
+        const float d = x[i].d;
+
+#if K_QUANTS_PER_ITERATION == 1
+        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
+                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
+                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
+                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
+                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
+        tmp += sum;
+#else
+        float sum = 0;
+        for (int l = 0; l < 4; ++l) {
+            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
+                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
+                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
+                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
+        }
+        tmp += sum;
+#endif
+
+    }
+
+#else
+
+    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
+    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
+
+    const int step = tid * K_QUANTS_PER_ITERATION;
+
+    float tmp = 0; // partial sum for thread in warp
+
+    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
+
+        const float   * y  = yy + i * QK_K + step;
+        const uint8_t * ql = x[i].ql + step;
+        const uint8_t * qh = x[i].qh + step;
+        const int8_t  * s  = x[i].scales;
+
+        const float d = x[i+0].d;
+
+        float sum = 0;
+        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
+            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
+                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
+                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
+                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
+        }
+        tmp += sum;
+
+    }
+
+#endif
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:16: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+        dst[row] = tmp;
+    }
+}
+
+static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const sycl::half *x = (const sycl::half *)vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x() = x[ib + iqs + 0];
+    v.y() = x[ib + iqs + 1];
+}
+
+static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
+    const float * x = (const float *) vx;
+
+    // automatic half -> float type cast if dfloat == float
+    v.x() = x[ib + iqs + 0];
+    v.y() = x[ib + iqs + 1];
+}
+
+static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2);
+
+    if (ix >= kx_padded) {
+        return;
+    }
+
+    const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                   item_ct1.get_local_id(1);
+
+    const int i_padded = iy*kx_padded + ix;
+
+    block_q8_1 * y = (block_q8_1 *) vy;
+
+    const int ib = i_padded / QK8_1; // block index
+    const int iqs = i_padded % QK8_1; // quant index
+
+    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
+    float amax = sycl::fabs((float)xi);
+    float sum = xi;
+
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:17: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
+                                    item_ct1.get_sub_group(), amax, mask));
+        /*
+        DPCT1023:18: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        sum +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask);
+    }
+
+    const float d = amax / 127;
+    const int8_t q = amax == 0.0f ? 0 : sycl::round(xi / d);
+
+    y[ib].qs[iqs] = q;
+
+    if (iqs > 0) {
+        return;
+    }
+
+    reinterpret_cast<sycl::half &>(y[ib].ds.x()) = d;
+    reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
+}
+
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void k_get_rows(
+            const void * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12,
+            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
+
+    const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                     item_ct1.get_local_id(2)) *
+                    2;
+    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) /
+                    ne12;
+    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) %
+                    ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
+
+    const int ib = i00/qk; // block index
+    const int iqs = (i00%qk)/qr; // quant index
+    const int iybs = i00 - i00%qk; // dst block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(src0_row, ib, iqs, v);
+
+    dst_row[iybs + iqs + 0] = v.x();
+    dst_row[iybs + iqs + y_offset] = v.y();
+}
+
+template<typename src0_t, typename dst_t>
+static void k_get_rows_float(
+            const src0_t * src0, const int32_t * src1, dst_t * dst,
+            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
+            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
+            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
+            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
+            size_t s10, size_t s11, size_t s12,
+            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
+
+    const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+                    item_ct1.get_local_id(2);
+    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) /
+                    ne12;
+    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
+                     item_ct1.get_local_id(0)) %
+                    ne12;
+
+    if (i00 >= ne00) {
+        return;
+    }
+
+    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
+
+    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
+    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
+
+    dst_row[i00] = src0_row[i00];
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
+                             const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  2 * item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    const int ib = i/qk; // block index
+    const int iqs = (i%qk)/qr; // quant index
+    const int iybs = i - i%qk; // y block start index
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    // dequantize
+    dfloat2 v;
+    dequantize_kernel(vx, ib, iqs, v);
+
+    y[iybs + iqs + 0] = v.x();
+    y[iybs + iqs + y_offset] = v.y();
+}
+
+// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
+// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
+
+#define VDR_Q4_0_Q8_1_MMVQ 2
+#define VDR_Q4_0_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
+                                                    const float &d4,
+                                                    const sycl::half2 &ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = __dp4a(vi0, u[2*i+0], sumi);
+        sumi = __dp4a(vi1, u[2*i+1], sumi);
+    }
+
+    const float2 ds8f = __half22float2(ds8);
+
+    // second part effectively subtracts 8 from each quant value
+    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
+#else
+    /*
+    DPCT1007:69: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_1_Q8_1_MMVQ 2
+#define VDR_Q4_1_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
+                                                    const sycl::half2 &dm4,
+                                                    const sycl::half2 &ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
+        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
+
+        // SIMD dot product of quantized values
+        sumi = __dp4a(vi0, u[2*i+0], sumi);
+        sumi = __dp4a(vi1, u[2*i+1], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
+    const float d4d8 = tmp.x;
+    const float m4s8 = tmp.y;
+#else
+    const float2 dm4f = __half22float2(dm4);
+    const float2 ds8f = __half22float2(ds8);
+    const float d4d8 = dm4f.x * ds8f.x;
+    const float m4s8 = dm4f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
+    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
+#else
+    /*
+    DPCT1007:70: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_0_Q8_1_MMVQ 2
+#define VDR_Q5_0_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
+                       const float &d5, const sycl::half2 &ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+    const float2 ds8f = __half22float2(ds8);
+
+    // second part effectively subtracts 16 from each quant value
+    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
+#else
+    /*
+    DPCT1007:71: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_1_Q8_1_MMVQ 2
+#define VDR_Q5_1_Q8_1_MMQ  4
+
+template <int vdr>
+static __dpct_inline__ float
+vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
+                       const sycl::half2 &dm5, const sycl::half2 &ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
+        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
+        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
+        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
+        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
+        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
+
+        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
+        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
+        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
+        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
+        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
+        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
+    const float d5d8 = tmp.x;
+    const float m5s8 = tmp.y;
+#else
+    const float2 dm5f = __half22float2(dm5);
+    const float2 ds8f = __half22float2(ds8);
+    const float d5d8 = dm5f.x * ds8f.x;
+    const float m5s8 = dm5f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
+    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
+
+#else
+    /*
+    DPCT1007:72: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q8_0_Q8_1_MMVQ 2
+#define VDR_Q8_0_Q8_1_MMQ 8
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
+                                                    const float &d8_0,
+                                                    const float &d8_1) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = __dp4a(v[i], u[i], sumi);
+    }
+
+    return d8_0*d8_1 * sumi;
+#else
+    /*
+    DPCT1007:73: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+template <int vdr>
+static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
+                                                    const sycl::half2 &dm8,
+                                                    const sycl::half2 &ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i = 0; i < vdr; ++i) {
+        // SIMD dot product of quantized values
+        sumi = __dp4a(v[i], u[i], sumi);
+    }
+
+#ifdef GGML_CUDA_F16
+    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
+    const float d8d8 = tmp.x;
+    const float m8s8 = tmp.y;
+#else
+    const float2 dm8f = __half22float2(dm8);
+    const float2 ds8f = __half22float2(ds8);
+    const float d8d8 = dm8f.x * ds8f.x;
+    const float m8s8 = dm8f.y * ds8f.y;
+#endif // GGML_CUDA_F16
+
+    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
+    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
+#else
+    /*
+    DPCT1007:74: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q2_K_Q8_1_MMVQ 1
+#define VDR_Q2_K_Q8_1_MMQ  2
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
+    const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales,
+    const sycl::half2 &dm2, const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++i) {
+        const int sc = scales[2*i];
+
+        const int vi = (v >> (2*i)) & 0x03030303;
+
+        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m <<  8;
+        m |= m << 16;
+        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
+    }
+
+    const float2 dm2f = __half22float2(dm2);
+
+    return dm2f.x*sumf_d - dm2f.y*sumf_m;
+#else
+    /*
+    DPCT1007:75: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const uint8_t *__restrict__ scales,
+                           const sycl::half2 &dm2, const float &d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi_d = 0;
+    int sumi_m = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
+        int sumi_d_sc = 0;
+
+        const int sc = scales[i0 / (QI8_1/2)];
+
+        // fill int with 4x m
+        int m = sc >> 4;
+        m |= m <<  8;
+        m |= m << 16;
+
+#pragma unroll
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
+            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
+        }
+
+        sumi_d += sumi_d_sc * (sc & 0xF);
+    }
+
+    const float2 dm2f = __half22float2(dm2);
+
+    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
+#else
+    /*
+    DPCT1007:76: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q3_K_Q8_1_MMVQ 1
+#define VDR_Q3_K_Q8_1_MMQ  2
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
+    const int &vl, const int &vh, const int *__restrict__ u,
+    const uint8_t *__restrict__ scales, const int &scale_offset,
+    const float &d3, const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        const int isc = scale_offset + 2*i;
+
+        const int isc_low = isc % (QK_K/32);
+        const int sc_shift_low = 4 * (isc / (QK_K/32));
+        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
+
+        const int isc_high = isc % (QK_K/64);
+        const int sc_shift_high = 2 * (isc / (QK_K/64));
+        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
+
+        const int sc = (sc_low | sc_high) - 32;
+
+        const int vil = (vl >> (2*i)) & 0x03030303;
+
+        const int vih = ((vh >> i) << 2) & 0x04040404;
+
+        const int vi = __vsubss4(vil, vih);
+
+        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d3 * sumf;
+#else
+    /*
+    DPCT1007:77: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const int8_t *__restrict__ scales, const float &d3,
+                           const float &d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    int sumi = 0;
+
+#pragma unroll
+    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
+        int sumi_sc = 0;
+
+        for (int i = i0; i < i0 + QI8_1/2; ++i) {
+            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
+        }
+
+        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
+    }
+
+    return d3*d8 * sumi;
+#else
+    /*
+    DPCT1007:78: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q4_K_Q8_1_MMVQ 2
+#define VDR_Q4_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K; ++i) {
+        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
+        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    /*
+    DPCT1007:79: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    /*
+    DPCT1007:80: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q5_K_Q8_1_MMVQ 2
+#define VDR_Q5_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
+    const int *__restrict__ vl, const int *__restrict__ vh,
+    const int *__restrict__ u, const uint8_t *__restrict__ sc,
+    const uint8_t *__restrict__ m, const sycl::half2 &dm5,
+    const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
+        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
+
+        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
+        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
+
+        const int v0i = vl0i | vh0i;
+        const int v1i = vl1i | vh1i;
+
+        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
+        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
+
+        sumf_d += d8[i] * (dot1 * sc[i]);
+        sumf_m += d8[i] * (dot2 * m[i]);
+
+    }
+
+    const float2 dm5f = __half22float2(dm5);
+
+    return dm5f.x*sumf_d - dm5f.y*sumf_m;
+
+#else
+    /*
+    DPCT1007:81: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
+    const int *__restrict__ v, const int *__restrict__ u,
+    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
+    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
+        int sumi_d = 0;
+
+#pragma unroll
+        for (int j = 0; j < QI8_1; ++j) {
+            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
+        }
+
+        const float2 ds8f = __half22float2(ds8[i]);
+
+        sumf_d += ds8f.x * (sc[i] * sumi_d);
+        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
+    }
+
+    const float2 dm4f = __half22float2(dm4);
+
+    return dm4f.x*sumf_d - dm4f.y*sumf_m;
+
+#else
+    /*
+    DPCT1007:82: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+#define VDR_Q6_K_Q8_1_MMVQ 1
+#define VDR_Q6_K_Q8_1_MMQ  8
+
+// contiguous v/x values
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
+                            const int *__restrict__ u,
+                            const int8_t *__restrict__ scales, const float &d,
+                            const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf = 0.0f;
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        const int sc = scales[4*i];
+
+        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
+
+        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
+
+        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
+
+        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
+    }
+
+    return d*sumf;
+#else
+    /*
+    DPCT1007:83: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+// contiguous u/y values
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
+                           const int8_t *__restrict__ sc, const float &d6,
+                           const float *__restrict__ d8) {
+
+#if DPCT_COMPATIBILITY_TEMP >=                                                 \
+    MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    float sumf_d = 0.0f;
+
+#pragma unroll
+    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
+        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
+
+#pragma unroll
+        for (int i = i0; i < i0 + 2; ++i) {
+            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
+            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
+
+            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
+            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
+        }
+
+        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
+    }
+
+    return d6 * sumf_d;
+
+#else
+    /*
+    DPCT1007:84: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+}
+
+static __dpct_inline__ float
+vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
+
+    int v[VDR_Q4_0_Q8_1_MMVQ];
+    int u[2*VDR_Q4_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
+        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs, float *tile_x_d) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs;
+    *x_dm = (sycl::half2 *)tile_x_d;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_0;
+    const int kqsx = k % QI4_0;
+
+    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
+
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
+        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const float * x_dmf = (const float *) x_dm;
+
+    int u[2*VDR_Q4_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
+    }
+
+    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
+         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+static __dpct_inline__ float
+vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
+
+    int v[VDR_Q4_1_Q8_1_MMVQ];
+    int u[2*VDR_Q4_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
+        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs, sycl::half2 *tile_x_dm) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs;
+    *x_dm = tile_x_dm;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_1;
+    const int kqsx = k % QI4_1;
+
+    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
+        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+
+    int u[2*VDR_Q4_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
+    }
+
+    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
+         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+static __dpct_inline__ float
+vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
+
+    int vl[VDR_Q5_0_Q8_1_MMVQ];
+    int vh[VDR_Q5_0_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
+        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
+        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
+    }
+
+    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, float *tile_x_d) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_ql;
+    *x_dm = (sycl::half2 *)tile_x_d;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_0;
+    const int kqsx = k % QI5_0;
+
+    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        const int ql = get_int_from_uint8(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
+
+        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
+        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
+        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
+        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
+        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
+        qs0 = dpct::vectorized_binary<sycl::char4>(
+            qs0, 0x10101010, dpct::sub_sat()); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
+        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
+        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
+        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
+        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
+        qs1 = dpct::vectorized_binary<sycl::char4>(
+            qs1, 0x10101010, dpct::sub_sat()); // subtract 16
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
+        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    int u[2*VDR_Q5_0_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
+    }
+
+    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+static __dpct_inline__ float
+vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
+
+    int vl[VDR_Q5_1_Q8_1_MMVQ];
+    int vh[VDR_Q5_1_Q8_1_MMVQ];
+    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
+        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
+        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
+        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
+    }
+
+    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset < nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_1;
+    const int kqsx = k % QI5_1;
+
+    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
+
+        int qs0 = (ql >>  0) & 0x0F0F0F0F;
+        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
+        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
+        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
+        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
+
+        int qs1 = (ql >>  4) & 0x0F0F0F0F;
+        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
+        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
+        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
+        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
+
+        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
+        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
+    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
+
+    int u[2*VDR_Q5_1_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
+        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
+        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
+    }
+
+    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
+        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
+}
+
+static __dpct_inline__ float
+vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
+
+    int v[VDR_Q8_0_Q8_1_MMVQ];
+    int u[VDR_Q8_0_Q8_1_MMVQ];
+
+#pragma unroll
+    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
+        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
+        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
+    }
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d,
+                                                      bq8_1->ds[1]);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_qs, float *tile_x_d) {
+    (void)x_qh; (void)x_sc;
+
+    *x_ql = tile_x_qs;
+    *x_dm = (sycl::half2 *)tile_x_d;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh; (void)x_sc;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI8_0;
+    const int kqsx = k % QI8_0;
+    float * x_dmf = (float *) x_dm;
+
+    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
+        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh; (void)x_sc;
+
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
+        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
+         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
+}
+
+static __dpct_inline__ float
+vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
+
+    const int bq8_offset = QR2_K * (iqs / QI8_1);
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    const uint8_t * scales = bq2_K->scales + scale_offset;
+
+    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
+    int    u[QR2_K];
+    float d8[QR2_K];
+
+#pragma unroll
+    for (int i = 0; i < QR2_K; ++ i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + i].ds[1];
+    }
+
+    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI2_K;
+    const int kqsx = k % QI2_K;
+
+    const block_q2_K * bx0 = (const block_q2_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
+    const int kbxd = k % blocks_per_tile_x_row;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
+        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
+
+        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
+    }
+}
+
+static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const int kbx = k / QI2_K;
+    const int ky  = (k % QI2_K) * QR2_K;
+    const float * y_df = (const float *) y_ds;
+
+    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
+
+    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
+    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
+
+#pragma unroll
+    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
+        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
+    }
+
+    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
+
+    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
+    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
+}
+
+static __dpct_inline__ float
+vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
+
+    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
+    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
+
+    const float d = bq3_K->d;
+
+    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
+
+    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
+
+    int    u[QR3_K];
+    float d8[QR3_K];
+
+#pragma unroll
+    for (int i = 0; i < QR3_K; ++i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + i].ds[1];
+    }
+
+    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh,
+                    int *tile_x_sc) {
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_qh = tile_x_qh;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI3_K;
+    const int kqsx = k % QI3_K;
+
+    const block_q3_K * bx0 = (const block_q3_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
+    const int kbxd = k % blocks_per_tile_x_row;
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
+        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
+        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
+
+        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
+        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
+        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
+
+        const int ksc = k % (QI3_K/4);
+
+        const int ksc_low = ksc % (QI3_K/8);
+        const int shift_low = 4 * (ksc / (QI3_K/8));
+        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
+
+        const int ksc_high = QI3_K/8;
+        const int shift_high = 2 * ksc;
+        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
+
+        const int sc = dpct::vectorized_binary<sycl::char4>(
+            sc_low | sc_high, 0x20202020, dpct::sub_sat());
+
+        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+
+    const int kbx  = k / QI3_K;
+    const int ky  = (k % QI3_K) * QR3_K;
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
+
+    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
+
+#pragma unroll
+    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
+        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
+        const int shift = 2 * ((ky % 32) / 8);
+        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
+
+        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
+        const int vlh = (vh << 2) & 0x04040404;
+
+        v[l] = dpct::vectorized_binary<sycl::char4>(vll, vlh, dpct::sub_sat());
+    }
+
+    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
+    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
+}
+
+static __dpct_inline__ float
+vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+#ifndef GGML_QKK_64
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    int    v[2];
+    int    u[2*QR4_K];
+    float d8[QR4_K];
+
+    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
+    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
+
+    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
+    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
+    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
+    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
+
+    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    v[0] = q4[0];
+    v[1] = q4[4];
+
+    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+    for (int i = 0; i < QR4_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = bq8i->ds[1];
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
+
+#else
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
+
+    float sumf_d = 0.0f;
+    float sumf_m = 0.0f;
+
+    uint16_t aux16[2];
+    const uint8_t * s = (const uint8_t *)aux16;
+
+    const uint16_t * a = (const uint16_t *)bq4_K->scales;
+    aux16[0] = a[0] & 0x0f0f;
+    aux16[1] = (a[0] >> 4) & 0x0f0f;
+
+    const float dall = bq4_K->dm[0];
+    const float dmin = bq4_K->dm[1];
+
+    const float d8_1 = __low2float(bq8_1[0].ds);
+    const float d8_2 = __low2float(bq8_1[1].ds);
+
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
+
+    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
+    const int v1 = q4[0];
+    const int v2 = q4[4];
+
+    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
+    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
+    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
+    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
+
+    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
+    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
+
+    return dall * sumf_d - dmin * sumf_m;
+
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI4_K; // == k if QK_K == 256
+
+    const block_q4_K * bx0 = (const block_q4_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
+
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
+        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+#if QK_K == 256
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
+#else
+        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
+#endif
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
+
+        const int * scales = (const int *) bxi->scales;
+
+        const int ksc = k % (WARP_SIZE/8);
+
+        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
+
+    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
+    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
+}
+
+static __dpct_inline__ float
+vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+#ifndef GGML_QKK_64
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    int   vl[2];
+    int   vh[2];
+    int    u[2*QR5_K];
+    float d8[QR5_K];
+
+    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
+    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
+    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
+
+    vl[0] = ql[0];
+    vl[1] = ql[4];
+
+    vh[0] = qh[0] >> bq8_offset;
+    vh[1] = qh[4] >> bq8_offset;
+
+    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
+    uint16_t aux[2];
+    const int j = bq8_offset/2;
+    if (j < 2) {
+        aux[0] = scales[j+0] & 0x3f3f;
+        aux[1] = scales[j+2] & 0x3f3f;
+    } else {
+        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
+        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
+    }
+    const uint8_t * sc = (const uint8_t *)aux;
+    const uint8_t * m  = sc + 2;
+
+#pragma unroll
+    for (int i = 0; i < QR5_K; ++i) {
+        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
+        d8[i] = bq8i->ds[0];
+
+        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
+        u[2*i+0] = q8[0];
+        u[2*i+1] = q8[4];
+    }
+
+    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
+
+#else
+
+#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
+
+    const int8_t * s = bq5_K->scales;
+
+    const float d = bq5_K->d;
+
+    const float d8_1 = __low2half(bq8_1[0].ds);
+    const float d8_2 = __low2half(bq8_1[1].ds);
+
+    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
+    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
+    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
+    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
+
+    const int * ql = (const int *)bq5_K->qs + (iqs/2);
+    const int vl1 = ql[0];
+    const int vl2 = ql[4];
+
+    const int step = 4 * (iqs/2); // 0, 4, 8, 12
+    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
+    const int in = step%8; // 0, 4, 0, 4
+    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
+
+    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
+    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
+    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
+    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
+
+    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
+                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
+
+    return d * sumf_d;
+
+#else
+    assert(false);
+    return 0.0f; // only to satisfy the compiler
+#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+
+#endif
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI5_K; // == k if QK_K == 256
+
+    const block_q5_K * bx0 = (const block_q5_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
+        const int ky = QR5_K*kqsx;
+
+        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
+        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
+        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
+
+        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
+        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
+
+        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
+        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
+        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+#if QK_K == 256
+        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
+#endif
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
+
+        const int * scales = (const int *) bxi->scales;
+
+        const int ksc = k % (WARP_SIZE/8);
+
+        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
+        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
+        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
+    }
+}
+
+static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
+
+    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
+    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
+    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
+                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
+}
+
+static __dpct_inline__ float
+vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
+                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
+
+    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
+
+    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
+    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
+    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
+
+    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
+    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
+
+    const int8_t * scales = bq6_K->scales + scale_offset;
+
+    int    u[QR6_K];
+    float d8[QR6_K];
+
+#pragma unroll
+    for (int i = 0; i < QR6_K; ++i) {
+        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
+        d8[i] = bq8_1[bq8_offset + 2 * i].ds[1];
+    }
+
+    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
+}
+
+template <int mmq_y>
+static __dpct_inline__ void
+allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
+                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
+    (void)x_qh;
+
+    *x_ql = tile_x_ql;
+    *x_dm = tile_x_dm;
+    *x_sc = tile_x_sc;
+}
+
+template <int mmq_y, int nwarps, bool need_check>
+static __dpct_inline__ void
+load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
+                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
+                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
+                const int &k, const int &blocks_per_row) {
+    (void)x_qh;
+
+    GGML_CUDA_ASSUME(i_offset >= 0);
+    GGML_CUDA_ASSUME(i_offset <  nwarps);
+    GGML_CUDA_ASSUME(k >= 0);
+    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+
+    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
+    const int kqsx = k % QI6_K; // == k if QK_K == 256
+
+    const block_q6_K * bx0 = (const block_q6_K *) vx;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
+        int i = i0 + i_offset;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
+        const int ky = QR6_K*kqsx;
+
+        const int ql = get_int_from_uint8(bxi->ql, kqsx);
+        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
+        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
+
+        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
+        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
+        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
+
+        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
+        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
+
+        x_ql[i * (2 * WARP_SIZE + 1) + kq0] =
+            dpct::vectorized_binary<sycl::char4>(ql0 | qh0, 0x20202020,
+                                                 dpct::sub_sat());
+        x_ql[i * (2 * WARP_SIZE + 1) + kq1] =
+            dpct::vectorized_binary<sycl::char4>(ql1 | qh1, 0x20202020,
+                                                 dpct::sub_sat());
+    }
+
+    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
+    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
+    float * x_dmf = (float *) x_dm;
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
+        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
+
+        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
+    }
+
+#pragma unroll
+    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
+        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
+
+        if (need_check) {
+            i = sycl::min(i, i_max);
+        }
+
+        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
+
+        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
+    }
+}
+
+static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat(
+    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
+    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
+    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
+    const int &i, const int &j, const int &k) {
+    (void)x_qh;
+
+    const float * x_dmf = (const float *) x_dm;
+    const float * y_df  = (const float *) y_ds;
+
+    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
+
+    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
+    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
+    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
+}
+
+template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
+          int mmq_y, int nwarps, allocate_tiles_cuda_t allocate_tiles,
+          load_tiles_cuda_t load_tiles, int vdr,
+          vec_dot_q_mul_mat_cuda_t vec_dot>
+/*
+DPCT1110:19: The total declared local variable size in device function mul_mat_q
+exceeds 128 bytes and may cause high register pressure. Consult with your
+hardware vendor to find the total register size available and adjust the code,
+or use smaller sub-group size to avoid high register pressure.
+*/
+static __dpct_inline__ void
+mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
+          float *__restrict__ dst, const int ncols_x, const int nrows_x,
+          const int ncols_y, const int nrows_y, const int nrows_dst,
+          const sycl::nd_item<3> &item_ct1, int *tile_y_qs,
+          sycl::half2 *tile_y_ds) {
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    const int blocks_per_row_x = ncols_x / qk;
+    const int blocks_per_col_y = nrows_y / QK8_1;
+    const int blocks_per_warp = WARP_SIZE / qi;
+
+    const int & ncols_dst = ncols_y;
+
+    const int row_dst_0 = item_ct1.get_group(2) * mmq_y;
+    const int & row_x_0 = row_dst_0;
+
+    const int col_dst_0 = item_ct1.get_group(1) * mmq_x;
+    const int & col_y_0 = col_dst_0;
+
+    int   * tile_x_ql = nullptr;
+    sycl::half2 *tile_x_dm = nullptr;
+    int   * tile_x_qh = nullptr;
+    int   * tile_x_sc = nullptr;
+
+    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+
+    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
+
+    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
+
+        load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm,
+                   tile_x_qh, tile_x_sc, item_ct1.get_local_id(1),
+                   nrows_x - row_x_0 - 1, item_ct1.get_local_id(2),
+                   blocks_per_row_x);
+
+#pragma unroll
+        for (int ir = 0; ir < qr; ++ir) {
+            const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2);
+            const int kbxd = kqs / QI8_1;
+
+#pragma unroll
+            for (int i = 0; i < mmq_x; i += nwarps) {
+                const int col_y_eff = dpct::min(
+                    (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i),
+                    ncols_y - 1); // to prevent out-of-bounds memory accesses
+
+                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
+
+                const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE +
+                                    kqs % WARP_SIZE;
+                tile_y_qs[index_y] = get_int_from_int8_aligned(
+                    by0->qs, item_ct1.get_local_id(2) % QI8_1);
+            }
+
+#pragma unroll
+            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
+                const int ids =
+                    (ids0 + item_ct1.get_local_id(1) * QI8_1 +
+                     item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) %
+                    mmq_x;
+                const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1);
+                const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1);
+
+                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
+                const sycl::half2 *dsi_src =
+                    &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) +
+                       ir * (WARP_SIZE / QI8_1) + kby]
+                         .ds;
+                sycl::half2 *dsi_dst =
+                    &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby];
+                if (need_sum) {
+                    *dsi_dst = *dsi_src;
+                } else {
+                    float * dfi_dst = (float *) dsi_dst;
+                    *dfi_dst = (*dsi_src)[1];
+                }
+            }
+
+            /*
+            DPCT1118:20: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            /*
+            DPCT1065:85: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier();
+
+// #pragma unroll // unrolling this loop causes too much register pressure
+            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
+#pragma unroll
+                for (int j = 0; j < mmq_x; j += nwarps) {
+#pragma unroll
+                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+                        sum[i / WARP_SIZE][j / nwarps] += vec_dot(
+                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
+                            tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i,
+                            item_ct1.get_local_id(1) + j, k);
+                    }
+                }
+            }
+
+            /*
+            DPCT1118:21: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            /*
+            DPCT1065:86: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier();
+        }
+    }
+
+#pragma unroll
+    for (int j = 0; j < mmq_x; j += nwarps) {
+        const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1);
+
+        if (col_dst >= ncols_dst) {
+            return;
+        }
+
+#pragma unroll
+        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
+            const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i;
+
+            if (row_dst >= nrows_dst) {
+                continue;
+            }
+
+            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
+        }
+    }
+}
+
+#define  MMQ_X_Q4_0_RDNA2  64
+#define  MMQ_Y_Q4_0_RDNA2  128
+#define NWARPS_Q4_0_RDNA2  8
+#define  MMQ_X_Q4_0_RDNA1  64
+#define  MMQ_Y_Q4_0_RDNA1  64
+#define NWARPS_Q4_0_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q4_0_AMPERE 4
+#define  MMQ_Y_Q4_0_AMPERE 32
+#define NWARPS_Q4_0_AMPERE 4
+#else
+#define  MMQ_X_Q4_0_AMPERE 64
+#define  MMQ_Y_Q4_0_AMPERE 128
+#define NWARPS_Q4_0_AMPERE 4
+#endif
+#define  MMQ_X_Q4_0_PASCAL 64
+#define  MMQ_Y_Q4_0_PASCAL 64
+#define NWARPS_Q4_0_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q4_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
+    const int nwarps = NWARPS_Q4_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
+    const int nwarps = NWARPS_Q4_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+    const int nwarps = NWARPS_Q4_0_AMPERE;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+    const int nwarps = NWARPS_Q4_0_PASCAL;
+
+    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
+        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_0_q8_1_mul_mat;
+    /*
+    DPCT1007:87: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q4_1_RDNA2  64
+#define  MMQ_Y_Q4_1_RDNA2  128
+#define NWARPS_Q4_1_RDNA2  8
+#define  MMQ_X_Q4_1_RDNA1  64
+#define  MMQ_Y_Q4_1_RDNA1  64
+#define NWARPS_Q4_1_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q4_1_AMPERE 4
+#define  MMQ_Y_Q4_1_AMPERE 32
+#define NWARPS_Q4_1_AMPERE 4
+#else
+#define  MMQ_X_Q4_1_AMPERE 64
+#define  MMQ_Y_Q4_1_AMPERE 128
+#define NWARPS_Q4_1_AMPERE 4
+#endif
+#define  MMQ_X_Q4_1_PASCAL 64
+#define  MMQ_Y_Q4_1_PASCAL 64
+#define NWARPS_Q4_1_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q4_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
+    const int nwarps = NWARPS_Q4_1_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
+    const int nwarps = NWARPS_Q4_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+    const int nwarps = NWARPS_Q4_1_AMPERE;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+    const int nwarps = NWARPS_Q4_1_PASCAL;
+
+    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
+        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_1_q8_1_mul_mat;
+    /*
+    DPCT1007:88: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q5_0_RDNA2  64
+#define  MMQ_Y_Q5_0_RDNA2  128
+#define NWARPS_Q5_0_RDNA2  8
+#define  MMQ_X_Q5_0_RDNA1  64
+#define  MMQ_Y_Q5_0_RDNA1  64
+#define NWARPS_Q5_0_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q5_0_AMPERE 4
+#define  MMQ_Y_Q5_0_AMPERE 32
+#define NWARPS_Q5_0_AMPERE 4
+#else
+#define  MMQ_X_Q5_0_AMPERE 128
+#define  MMQ_Y_Q5_0_AMPERE 64
+#define NWARPS_Q5_0_AMPERE 4
+#endif
+#define  MMQ_X_Q5_0_PASCAL 64
+#define  MMQ_Y_Q5_0_PASCAL 64
+#define NWARPS_Q5_0_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q5_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
+    const int nwarps = NWARPS_Q5_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
+    const int nwarps = NWARPS_Q5_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+    const int nwarps = NWARPS_Q5_0_AMPERE;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+    const int nwarps = NWARPS_Q5_0_PASCAL;
+
+    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
+        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_0_q8_1_mul_mat;
+    /*
+    DPCT1007:89: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q5_1_RDNA2  64
+#define  MMQ_Y_Q5_1_RDNA2  128
+#define NWARPS_Q5_1_RDNA2  8
+#define  MMQ_X_Q5_1_RDNA1  64
+#define  MMQ_Y_Q5_1_RDNA1  64
+#define NWARPS_Q5_1_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q5_1_AMPERE 4
+#define  MMQ_Y_Q5_1_AMPERE 32
+#define NWARPS_Q5_1_AMPERE 4
+#else
+#define  MMQ_X_Q5_1_AMPERE 128
+#define  MMQ_Y_Q5_1_AMPERE 64
+#define NWARPS_Q5_1_AMPERE 4
+#endif
+#define  MMQ_X_Q5_1_PASCAL 64
+#define  MMQ_Y_Q5_1_PASCAL 64
+#define NWARPS_Q5_1_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q5_1(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
+    const int nwarps = NWARPS_Q5_1_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
+    const int nwarps = NWARPS_Q5_1_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+    const int nwarps = NWARPS_Q5_1_AMPERE;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+    const int nwarps = NWARPS_Q5_1_PASCAL;
+
+    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
+        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_1_q8_1_mul_mat;
+    /*
+    DPCT1007:90: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q8_0_RDNA2  64
+#define  MMQ_Y_Q8_0_RDNA2  128
+#define NWARPS_Q8_0_RDNA2  8
+#define  MMQ_X_Q8_0_RDNA1  64
+#define  MMQ_Y_Q8_0_RDNA1  64
+#define NWARPS_Q8_0_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q8_0_AMPERE 4
+#define  MMQ_Y_Q8_0_AMPERE 32
+#define NWARPS_Q8_0_AMPERE 4
+#else
+#define  MMQ_X_Q8_0_AMPERE 128
+#define  MMQ_Y_Q8_0_AMPERE 64
+#define NWARPS_Q8_0_AMPERE 4
+#endif
+#define  MMQ_X_Q8_0_PASCAL 64
+#define  MMQ_Y_Q8_0_PASCAL 64
+#define NWARPS_Q8_0_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    mul_mat_q8_0(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
+    const int nwarps = NWARPS_Q8_0_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
+    const int nwarps = NWARPS_Q8_0_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+    const int nwarps = NWARPS_Q8_0_AMPERE;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+    const int nwarps = NWARPS_Q8_0_PASCAL;
+
+    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
+        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q8_0_q8_1_mul_mat;
+    /*
+    DPCT1007:91: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q2_K_RDNA2  64
+#define  MMQ_Y_Q2_K_RDNA2  128
+#define NWARPS_Q2_K_RDNA2  8
+#define  MMQ_X_Q2_K_RDNA1  128
+#define  MMQ_Y_Q2_K_RDNA1  32
+#define NWARPS_Q2_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q2_K_AMPERE 4
+#define  MMQ_Y_Q2_K_AMPERE 32
+#define NWARPS_Q2_K_AMPERE 4
+#else
+#define  MMQ_X_Q2_K_AMPERE 64
+#define  MMQ_Y_Q2_K_AMPERE 128
+#define NWARPS_Q2_K_AMPERE 4
+#endif
+#define  MMQ_X_Q2_K_PASCAL 64
+#define  MMQ_Y_Q2_K_PASCAL 64
+#define NWARPS_Q2_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q2_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
+    const int nwarps = NWARPS_Q2_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
+    const int nwarps = NWARPS_Q2_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+    const int nwarps = NWARPS_Q2_K_AMPERE;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+    const int nwarps = NWARPS_Q2_K_PASCAL;
+
+    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
+        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q2_K_q8_1_mul_mat;
+    /*
+    DPCT1007:92: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q3_K_RDNA2  128
+#define  MMQ_Y_Q3_K_RDNA2  64
+#define NWARPS_Q3_K_RDNA2  8
+#define  MMQ_X_Q3_K_RDNA1  32
+#define  MMQ_Y_Q3_K_RDNA1  128
+#define NWARPS_Q3_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q3_K_AMPERE 4
+#define  MMQ_Y_Q3_K_AMPERE 32
+#define NWARPS_Q3_K_AMPERE 4
+#else
+#define  MMQ_X_Q3_K_AMPERE 128
+#define  MMQ_Y_Q3_K_AMPERE 128
+#define NWARPS_Q3_K_AMPERE 4
+#endif
+#define  MMQ_X_Q3_K_PASCAL 64
+#define  MMQ_Y_Q3_K_PASCAL 64
+#define NWARPS_Q3_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q3_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
+    const int nwarps = NWARPS_Q3_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
+    const int nwarps = NWARPS_Q3_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+    const int nwarps = NWARPS_Q3_K_AMPERE;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+    const int nwarps = NWARPS_Q3_K_PASCAL;
+
+    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
+        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q3_K_q8_1_mul_mat;
+    /*
+    DPCT1007:93: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q4_K_RDNA2  64
+#define  MMQ_Y_Q4_K_RDNA2  128
+#define NWARPS_Q4_K_RDNA2  8
+#define  MMQ_X_Q4_K_RDNA1  32
+#define  MMQ_Y_Q4_K_RDNA1  64
+#define NWARPS_Q4_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q4_K_AMPERE 4
+#define  MMQ_Y_Q4_K_AMPERE 32
+#define NWARPS_Q4_K_AMPERE 4
+#else
+#define  MMQ_X_Q4_K_AMPERE 64
+#define  MMQ_Y_Q4_K_AMPERE 128
+#define NWARPS_Q4_K_AMPERE 4
+#endif
+#define  MMQ_X_Q4_K_PASCAL 64
+#define  MMQ_Y_Q4_K_PASCAL 64
+#define NWARPS_Q4_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q4_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
+    const int nwarps = NWARPS_Q4_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
+    const int nwarps = NWARPS_Q4_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+    const int nwarps = NWARPS_Q4_K_AMPERE;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+    const int nwarps = NWARPS_Q4_K_PASCAL;
+
+    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
+        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q4_K_q8_1_mul_mat;
+    /*
+    DPCT1007:94: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q5_K_RDNA2  64
+#define  MMQ_Y_Q5_K_RDNA2  128
+#define NWARPS_Q5_K_RDNA2  8
+#define  MMQ_X_Q5_K_RDNA1  32
+#define  MMQ_Y_Q5_K_RDNA1  64
+#define NWARPS_Q5_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q5_K_AMPERE 4
+#define  MMQ_Y_Q5_K_AMPERE 32
+#define NWARPS_Q5_K_AMPERE 4
+#else
+#define  MMQ_X_Q5_K_AMPERE 64
+#define  MMQ_Y_Q5_K_AMPERE 128
+#define NWARPS_Q5_K_AMPERE 4
+#endif
+#define  MMQ_X_Q5_K_PASCAL 64
+#define  MMQ_Y_Q5_K_PASCAL 64
+#define NWARPS_Q5_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+mul_mat_q5_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
+    const int nwarps = NWARPS_Q5_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
+    const int nwarps = NWARPS_Q5_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+    const int nwarps = NWARPS_Q5_K_AMPERE;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+    const int nwarps = NWARPS_Q5_K_PASCAL;
+
+    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
+        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q5_K_q8_1_mul_mat;
+    /*
+    DPCT1007:95: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+#define  MMQ_X_Q6_K_RDNA2  64
+#define  MMQ_Y_Q6_K_RDNA2  128
+#define NWARPS_Q6_K_RDNA2  8
+#define  MMQ_X_Q6_K_RDNA1  32
+#define  MMQ_Y_Q6_K_RDNA1  64
+#define NWARPS_Q6_K_RDNA1  8
+#if defined(CUDA_USE_TENSOR_CORES)
+#define  MMQ_X_Q6_K_AMPERE 4
+#define  MMQ_Y_Q6_K_AMPERE 32
+#define NWARPS_Q6_K_AMPERE 4
+#else
+#define  MMQ_X_Q6_K_AMPERE 64
+#define  MMQ_Y_Q6_K_AMPERE 64
+#define NWARPS_Q6_K_AMPERE 4
+#endif
+#define  MMQ_X_Q6_K_PASCAL 64
+#define  MMQ_Y_Q6_K_PASCAL 64
+#define NWARPS_Q6_K_PASCAL 8
+
+template <bool need_check> static void
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
+#endif // defined(RDNA3) || defined(RDNA2)
+#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
+
+#endif // __CUDA_ARCH__ < CC_VOLTA
+    mul_mat_q6_K(
+    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#if defined(RDNA3) || defined(RDNA2)
+    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
+    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
+    const int nwarps = NWARPS_Q6_K_RDNA2;
+#else
+    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
+    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
+    const int nwarps = NWARPS_Q6_K_RDNA1;
+#endif // defined(RDNA3) || defined(RDNA2)
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
+    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+    const int nwarps = NWARPS_Q6_K_AMPERE;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+
+#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
+    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
+    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+    const int nwarps = NWARPS_Q6_K_PASCAL;
+
+    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
+        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
+        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
+#else
+    (void) vec_dot_q6_K_q8_1_mul_mat;
+    /*
+    DPCT1007:96: Migration of __assert_fail is not supported.
+    */
+    assert(false);
+#endif // __CUDA_ARCH__ >= CC_VOLTA
+}
+
+template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
+static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
+                          const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int blocks_per_row = ncols / qk;
+    const int blocks_per_warp = vdr * WARP_SIZE / qi;
+
+// partial sum for each thread
+    float tmp = 0.0f;
+
+    const block_q_t  * x = (const block_q_t  *) vx;
+    const block_q8_1 * y = (const block_q8_1 *) vy;
+
+    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
+        const int ibx = row * blocks_per_row + i +
+                        item_ct1.get_local_id(2) / (qi / vdr); // x block index
+
+        const int iby = (i + item_ct1.get_local_id(2) / (qi / vdr)) *
+                        (qk / QK8_1); // y block index that aligns with ibx
+
+        const int iqs =
+            vdr *
+            (item_ct1.get_local_id(2) %
+             (qi / vdr)); // x block quant index when casting the quants to int
+
+        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:22: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[row] = tmp;
+    }
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
+                                   const sycl::nd_item<3> &item_ct1) {
+    // qk = quantized weights per x block
+    // qr = number of quantized weights per data value in x block
+    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
+                    item_ct1.get_local_id(1);
+
+    if (row >= nrows) {
+        return;
+    }
+
+    const int tid = item_ct1.get_local_id(2);
+
+    const int iter_stride = 2*GGML_CUDA_DMMV_X;
+    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+// partial sum for each thread
+#ifdef GGML_CUDA_F16
+    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#else
+    float tmp = 0.0f;
+#endif // GGML_CUDA_F16
+
+    for (int i = 0; i < ncols; i += iter_stride) {
+        const int col = i + vals_per_iter*tid;
+        const int ib = (row*ncols + col)/qk; // x block index
+        const int iqs = (col%qk)/qr; // x quant index
+        const int iybs = col - col%qk; // y block start index
+
+// processing >2 values per i iter is faster for fast GPUs
+#pragma unroll
+        for (int j = 0; j < vals_per_iter; j += 2) {
+            // process 2 vals per j iter
+
+            // dequantize
+            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
+            dfloat2 v;
+            dequantize_kernel(vx, ib, iqs + j/qr, v);
+
+            // matrix multiplication
+            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
+#ifdef GGML_CUDA_F16
+            tmp += __hmul2(v, {
+                y[iybs + iqs + j/qr + 0],
+                y[iybs + iqs + j/qr + y_offset]
+            });
+#else
+            tmp += v.x() * y[iybs + iqs + j / qr + 0];
+            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
+#endif // GGML_CUDA_F16
+        }
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:23: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (tid == 0) {
+#ifdef GGML_CUDA_F16
+        dst[row] = tmp.x + tmp.y;
+#else
+        dst[row] = tmp;
+#endif // GGML_CUDA_F16
+    }
+}
+
+static void mul_mat_p021_f16_f32(
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
+    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
+    const sycl::nd_item<3> &item_ct1) {
+
+    const sycl::half *x = (const sycl::half *)vx;
+
+    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                      item_ct1.get_local_id(1);
+    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                        item_ct1.get_local_id(0);
+    const int channel_x = channel / (nchannels_y / nchannels_x);
+
+    const int nrows_y = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst = row_x;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x;
+         col_x0 += item_ct1.get_local_range(2)) {
+        const int col_x = col_x0 + item_ct1.get_local_id(2);
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        // x is transposed and permuted
+        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
+        const float xi =
+            sycl::vec<sycl::half, 1>{x[ix]}
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        const int row_y = col_x;
+
+
+        // y is not transposed but permuted
+        const int iy = channel*nrows_y + row_y;
+
+        tmp += xi * y[iy];
+    }
+
+    // dst is not transposed and not permuted
+    const int idst = channel*nrows_dst + row_dst;
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:24: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
+    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
+    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
+    const sycl::nd_item<3> &item_ct1) {
+
+    const sycl::half *x = (const sycl::half *)vx;
+
+    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                      item_ct1.get_local_id(1);
+    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
+                        item_ct1.get_local_id(0);
+    const int channel_x = channel / channel_x_divisor;
+
+    const int nrows_y   = ncols_x;
+    const int nrows_dst = nrows_x;
+    const int row_dst   = row_x;
+
+    const int idst = channel*nrows_dst + row_dst;
+
+    float tmp = 0.0f;
+
+    for (int col_x0 = 0; col_x0 < ncols_x;
+         col_x0 += item_ct1.get_local_range(2)) {
+        const int col_x = col_x0 + item_ct1.get_local_id(2);
+
+        if (col_x >= ncols_x) {
+            break;
+        }
+
+        const int row_y = col_x;
+
+        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
+        const int iy = channel*nrows_y + row_y;
+
+        const float xi =
+            sycl::vec<sycl::half, 1>{x[ix]}
+                .convert<float, sycl::rounding_mode::automatic>()[0];
+
+        tmp += xi * y[iy];
+    }
+
+    // sum up partial sums and write back result
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        /*
+        DPCT1023:25: The SYCL sub-group does not support mask options for
+        dpct::permute_sub_group_by_xor. You can specify
+        "--use-experimental-features=masked-sub-group-operation" to use the
+        experimental helper function to migrate __shfl_xor_sync.
+        */
+        tmp +=
+            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
+    }
+
+    if (item_ct1.get_local_id(2) == 0) {
+        dst[idst] = tmp;
+    }
+}
+
+static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    float * dsti = (float *) cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    sycl::half *dsti = (sycl::half *)cdsti;
+
+    *dsti = sycl::vec<float, 1>{(*xi)}
+                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+}
+
+static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
+    const sycl::half *xi = (const sycl::half *)cxi;
+    sycl::half *dsti = (sycl::half *)cdsti;
+
+    *dsti = *xi;
+}
+
+template <cpy_kernel_t cpy_1>
+static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
+                                   const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+                                   const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
+                                   const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= ne) {
+        return;
+    }
+
+    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
+    // then combine those indices with the corresponding byte offsets to get the total offsets
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = i - i02*ne01*ne00 - i01*ne00;
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = i - i12*ne10*ne11 - i11*ne10;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_1(cx + x_offset, cdst + dst_offset);
+}
+
+static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q8_0 * dsti = (block_q8_0 *) cdsti;
+
+    float amax = 0.0f; // absolute max
+
+    for (int j = 0; j < QK8_0; j++) {
+        const float v = xi[j];
+        amax = sycl::fmax(amax, sycl::fabs((float)v));
+    }
+
+    const float d = amax / ((1 << 7) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK8_0; ++j) {
+        const float x0 = xi[j]*id;
+
+        dsti->qs[j] = sycl::round((float)x0);
+    }
+}
+
+static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_0 * dsti = (block_q4_0 *) cdsti;
+
+    float amax = 0.0f;
+    float vmax = 0.0f;
+
+    for (int j = 0; j < QK4_0; ++j) {
+        const float v = xi[j];
+        if (amax < sycl::fabs((float)v)) {
+            amax = sycl::fabs((float)v);
+            vmax = v;
+        }
+    }
+
+    const float d  = vmax / -8;
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->d = d;
+
+    for (int j = 0; j < QK4_0/2; ++j) {
+        const float x0 = xi[0       + j]*id;
+        const float x1 = xi[QK4_0/2 + j]*id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
+    const float * xi = (const float *) cxi;
+    block_q4_1 * dsti = (block_q4_1 *) cdsti;
+
+    float vmin = FLT_MAX;
+    float vmax = -FLT_MAX;
+
+    for (int j = 0; j < QK4_1; ++j) {
+        const float v = xi[j];
+
+        if (v < vmin) vmin = v;
+        if (v > vmax) vmax = v;
+    }
+
+    const float d  = (vmax - vmin) / ((1 << 4) - 1);
+    const float id = d ? 1.0f/d : 0.0f;
+
+    dsti->dm.x() = d;
+    dsti->dm.y() = vmin;
+
+    for (int j = 0; j < QK4_1/2; ++j) {
+        const float x0 = (xi[0       + j] - vmin)*id;
+        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
+
+        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f));
+        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f));
+
+        dsti->qs[j]  = xi0;
+        dsti->qs[j] |= xi1 << 4;
+    }
+}
+
+template <cpy_kernel_t cpy_blck, int qk>
+static void cpy_f32_q(const char * cx, char * cdst, const int ne,
+                                 const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
+                                 const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
+                                 const sycl::nd_item<3> &item_ct1) {
+    const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                   item_ct1.get_local_id(2)) *
+                  qk;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const int i02 = i / (ne00*ne01);
+    const int i01 = (i - i02*ne01*ne00) / ne00;
+    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
+    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
+
+    const int i12 = i / (ne10*ne11);
+    const int i11 = (i - i12*ne10*ne11) / ne10;
+    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
+    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
+
+    cpy_blck(cx + x_offset, cdst + dst_offset);
+}
+
+static float rope_yarn_ramp(const float low, const float high, const int i0) {
+    const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
+    return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
+}
+
+struct rope_corr_dims {
+    float v[4];
+};
+
+// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
+// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
+static void rope_yarn(
+    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
+    float * cos_theta, float * sin_theta
+) {
+    // Get n-d rotational scaling corrected for extrapolation
+    float theta_interp = freq_scale * theta_extrap;
+    float theta = theta_interp;
+    if (ext_factor != 0.0f) {
+        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
+        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
+
+        // Get n-d magnitude scaling corrected for interpolation
+        mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale);
+    }
+    *cos_theta = sycl::cos(theta) * mscale;
+    *sin_theta = sycl::sin(theta) * mscale;
+}
+
+// rope == RoPE == rotary positional embedding
+template<typename T, bool has_pos>
+static void rope(
+    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims
+,
+    const sycl::nd_item<3> &item_ct1) {
+    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                         item_ct1.get_local_id(1));
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;
+
+    const int p = has_pos ? pos[i2] : 0;
+    const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + 1];
+
+    dst[i + 0] = x0*cos_theta - x1*sin_theta;
+    dst[i + 1] = x0*sin_theta + x1*cos_theta;
+}
+
+template<typename T, bool has_pos>
+static void rope_neox(
+    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
+    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
+,
+    const sycl::nd_item<3> &item_ct1) {
+    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                         item_ct1.get_local_id(1));
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int ib = col / n_dims;
+    const int ic = col % n_dims;
+
+    const int i = row*ncols + ib*n_dims + ic/2;
+    const int i2 = row/p_delta_rows;
+
+    float cur_rot = inv_ndims * ic - ib;
+
+    const int p = has_pos ? pos[i2] : 0;
+    const float theta_base =
+        p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
+
+    float cos_theta, sin_theta;
+    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + n_dims/2];
+
+    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
+    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
+}
+
+static void rope_glm_f32(
+    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
+    int n_ctx
+, const sycl::nd_item<3> &item_ct1) {
+    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+    const int half_n_dims = ncols/4;
+
+    if (col >= half_n_dims) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i = row*ncols + col;
+    const int i2 = row/p_delta_rows;
+
+    const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols);
+     // FIXME: this is likely wrong
+    const int p = pos != nullptr ? pos[i2] : 0;
+
+    const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale;
+    const float sin_theta = sycl::sin((float)theta);
+    const float cos_theta = sycl::cos((float)theta);
+
+    const float x0 = x[i + 0];
+    const float x1 = x[i + half_n_dims];
+
+    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
+    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
+
+    const float block_theta =
+        ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale;
+    const float sin_block_theta = sycl::sin((float)block_theta);
+    const float cos_block_theta = sycl::cos((float)block_theta);
+
+    const float x2 = x[i + half_n_dims * 2];
+    const float x3 = x[i + half_n_dims * 3];
+
+    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
+    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
+}
+
+static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
+                                 const int n_heads_log2_floor, const float m0, const float m1,
+                                 const sycl::nd_item<3> &item_ct1) {
+    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int i = row*ncols + col;
+
+    const int k = row/k_rows;
+
+    float m_k;
+    if (k < n_heads_log2_floor) {
+        m_k = dpct::pow(m0, k + 1);
+    } else {
+        m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
+    }
+
+    dst[i] = col * m_k + x[i];
+}
+
+static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int row = item_ct1.get_group(1);
+    const int col = item_ct1.get_local_id(2);
+
+    float sum = 0.0f;
+    for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) {
+        sum += x[row * ncols + i];
+    }
+
+    sum = warp_reduce_sum(sum, item_ct1);
+
+    if (col == 0) {
+        dst[row] = sum;
+    }
+}
+
+template<typename T>
+static inline void swap(T & a, T & b) {
+    T tmp = a;
+    a = b;
+    b = tmp;
+}
+
+template<ggml_sort_order order>
+static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
+                              const sycl::nd_item<3> &item_ct1) {
+    // bitonic sort
+    int col = item_ct1.get_local_id(2);
+    int row = item_ct1.get_group(1);
+
+    if (col >= ncols) return;
+
+    const float * x_row = x + row * ncols;
+    int * dst_row = dst + row * ncols;
+
+    // initialize indices
+    if (col < ncols) {
+        dst_row[col] = col;
+    }
+    /*
+    DPCT1065:97: Consider replacing sycl::nd_item::barrier() with
+    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
+    performance if there is no access to global memory.
+    */
+    item_ct1.barrier();
+
+    for (int k = 2; k <= ncols; k *= 2) {
+        for (int j = k / 2; j > 0; j /= 2) {
+            int ixj = col ^ j;
+            if (ixj > col) {
+                if ((col & k) == 0) {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                } else {
+                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
+                        swap(dst_row[col], dst_row[ixj]);
+                    }
+                }
+            }
+            /*
+            DPCT1118:26: SYCL group functions and algorithms must be encountered
+            in converged control flow. You may need to adjust the code.
+            */
+            /*
+            DPCT1065:98: Consider replacing sycl::nd_item::barrier() with
+            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+            better performance if there is no access to global memory.
+            */
+            item_ct1.barrier();
+        }
+    }
+}
+
+static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
+                              const sycl::nd_item<3> &item_ct1) {
+    const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
+                    item_ct1.get_local_id(1);
+    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                    item_ct1.get_local_id(2);
+
+    if (col >= ncols) {
+        return;
+    }
+
+    const int i = row*ncols + col;
+    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
+    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
+    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
+}
+
+static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
+                         const sycl::nd_item<3> &item_ct1, float *buf) {
+    const int tid = item_ct1.get_local_id(2);
+    const int rowx = item_ct1.get_group(2);
+    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
+
+    const int block_size = item_ct1.get_local_range(2);
+
+    const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
+    const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
+
+    float max_val = -INFINITY;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
+    }
+
+    // find the max value in the block
+    max_val = warp_reduce_max(max_val, item_ct1);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = -INFINITY;
+        }
+        /*
+        DPCT1118:27: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:99: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        if (lane_id == 0) {
+            buf[warp_id] = max_val;
+        }
+        /*
+        DPCT1118:28: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:100: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        max_val = buf[lane_id];
+        max_val = warp_reduce_max(max_val, item_ct1);
+    }
+
+    float tmp = 0.f;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int ix = rowx*ncols + col;
+        const int iy = rowy*ncols + col;
+        const float val =
+            sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
+        tmp += val;
+        dst[ix] = val;
+    }
+
+    // find the sum of exps in the block
+    tmp = warp_reduce_sum(tmp, item_ct1);
+    if (block_size > WARP_SIZE) {
+        if (warp_id == 0) {
+            buf[lane_id] = 0.f;
+        }
+        /*
+        DPCT1118:29: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:101: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        if (lane_id == 0) {
+            buf[warp_id] = tmp;
+        }
+        /*
+        DPCT1118:30: SYCL group functions and algorithms must be encountered in
+        converged control flow. You may need to adjust the code.
+        */
+        /*
+        DPCT1065:102: Consider replacing sycl::nd_item::barrier() with
+        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
+        better performance if there is no access to global memory.
+        */
+        item_ct1.barrier();
+
+        tmp = buf[lane_id];
+        tmp = warp_reduce_sum(tmp, item_ct1);
+    }
+
+    const float inv_tmp = 1.f / tmp;
+
+    for (int col = tid; col < ncols; col += block_size) {
+        const int i = rowx*ncols + col;
+        dst[i] *= inv_tmp;
+    }
+}
+
+static void scale_f32(const float * x, float * dst, const float scale, const int k,
+                      const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = scale * x[i];
+}
+
+static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k,
+                      const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
+                  item_ct1.get_local_id(2);
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
+}
+
+static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta,
+                           int IW, int IH, int OW, int KW, int KH,
+                           int pelements, int CHW, int s0, int s1, int p0,
+                           int p1, int d0, int d1,
+                           const sycl::nd_item<3> &item_ct1) {
+    const int i = item_ct1.get_local_id(2) +
+                  item_ct1.get_group(2) * item_ct1.get_local_range(2);
+    if (i >= pelements) {
+        return;
+    }
+
+    const int ksize = OW * (KH > 1 ? KW : 1);
+    const int kx = i / ksize;
+    const int kd = kx * ksize;
+    const int ky = (i - kd) / OW;
+    const int ix = i % OW;
+
+    const int iiw = ix * s0 + kx * d0 - p0;
+    const int iih = item_ct1.get_group(1) * s1 + ky * d1 - p1;
+
+    const int offset_dst = (item_ct1.get_group(1) * OW + ix) * CHW +
+                           (item_ct1.get_group(0) * (KW * KH) + ky * KW + kx);
+
+    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
+        dst[offset_dst] =
+            sycl::vec<float, 1>{0.0f}
+                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+    } else {
+        const int offset_src = item_ct1.get_group(0) * offset_delta;
+        dst[offset_dst] =
+            sycl::vec<float, 1>{x[offset_src + iih * IW + iiw]}
+                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
+    }
+}
+
+template <int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const ggml_tensor *src0, const ggml_tensor *src1,
+                          ggml_tensor *dst, const void *src0_dd,
+                          const int32_t *src1_dd, float *dst_dd,
+                          dpct::queue_ptr stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    GGML_ASSERT(ne00 % 2 == 0);
+
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             k_get_rows<qk, qr, dq>(
+                                 src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+                         });
+
+    (void) dst;
+}
+
+template <typename src0_t>
+static void get_rows_cuda_float(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const src0_t *src0_dd, const int32_t *src1_dd,
+                                float *dst_dd, dpct::queue_ptr stream) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
+
+    // strides in elements
+    //const size_t s0 = nb0 / ggml_element_size(dst);
+    const size_t s1 = nb1 / ggml_element_size(dst);
+    const size_t s2 = nb2 / ggml_element_size(dst);
+    const size_t s3 = nb3 / ggml_element_size(dst);
+
+    const size_t s10 = nb10 / ggml_element_size(src1);
+    const size_t s11 = nb11 / ggml_element_size(src1);
+    const size_t s12 = nb12 / ggml_element_size(src1);
+    //const size_t s13 = nb13 / ggml_element_size(src1);
+
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
+                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
+            });
+    }
+
+    (void) dst;
+}
+
+template<float (*bin_op)(const float, const float)>
+struct bin_bcast_cuda {
+    template <typename src0_t, typename src1_t, typename dst_t>
+    void operator()(const struct ggml_tensor *src0,
+                    const struct ggml_tensor *src1, struct ggml_tensor *dst,
+                    const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
+                    dpct::queue_ptr stream) {
+
+        GGML_TENSOR_BINARY_OP_LOCALS
+
+        int nr0 = ne10/ne0;
+        int nr1 = ne11/ne1;
+        int nr2 = ne12/ne2;
+        int nr3 = ne13/ne3;
+
+        int nr[4] = { nr0, nr1, nr2, nr3 };
+
+        // collapse dimensions until first broadcast dimension
+        int64_t cne0[] = {ne0, ne1, ne2, ne3};
+        int64_t cne1[] = {ne10, ne11, ne12, ne13};
+        size_t cnb0[] = {nb0, nb1, nb2, nb3};
+        size_t cnb1[] = {nb10, nb11, nb12, nb13};
+        auto collapse = [](int64_t cne[]) {
+            cne[0] *= cne[1];
+            cne[1] = cne[2];
+            cne[2] = cne[3];
+            cne[3] = 1;
+        };
+
+        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
+            cnb[1] *= cne[1];
+            cnb[2] *= cne[2];
+            cnb[3] *= cne[3];
+        };
+
+        for (int i = 0; i < 4; i++) {
+            if (nr[i] != 1) {
+                break;
+            }
+            if (i > 0) {
+                collapse_nb(cnb0, cne0);
+                collapse_nb(cnb1, cne1);
+                collapse(cne0);
+                collapse(cne1);
+            }
+        }
+        {
+            int64_t ne0 = cne0[0];
+            int64_t ne1 = cne0[1];
+            int64_t ne2 = cne0[2];
+            int64_t ne3 = cne0[3];
+
+            int64_t ne10 = cne1[0];
+            int64_t ne11 = cne1[1];
+            int64_t ne12 = cne1[2];
+            int64_t ne13 = cne1[3];
+
+            size_t nb0 = cnb0[0];
+            size_t nb1 = cnb0[1];
+            size_t nb2 = cnb0[2];
+            size_t nb3 = cnb0[3];
+
+            size_t nb10 = cnb1[0];
+            size_t nb11 = cnb1[1];
+            size_t nb12 = cnb1[2];
+            size_t nb13 = cnb1[3];
+
+            size_t s0 = nb0 / sizeof(dst_t);
+            size_t s1 = nb1 / sizeof(dst_t);
+            size_t s2 = nb2 / sizeof(dst_t);
+            size_t s3 = nb3 / sizeof(dst_t);
+
+            size_t s10 = nb10 / sizeof(src1_t);
+            size_t s11 = nb11 / sizeof(src1_t);
+            size_t s12 = nb12 / sizeof(src1_t);
+            size_t s13 = nb13 / sizeof(src1_t);
+
+            GGML_ASSERT(s0 == 1);
+            GGML_ASSERT(s10 == 1);
+
+            const int block_size = 128;
+
+            int64_t hne0 = std::max(ne0/2LL, 1LL);
+
+            sycl::range<3> block_dims(1, 1, 1);
+            block_dims[2] = std::min<unsigned int>(hne0, block_size);
+            block_dims[1] =
+                std::min<unsigned int>(ne1, block_size / block_dims[2]);
+            block_dims[0] = std::min(
+                std::min<unsigned int>(ne2 * ne3, block_size / block_dims[2] /
+                                                      block_dims[1]),
+                64U);
+
+            sycl::range<3> block_nums(
+                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
+                (ne1 + block_dims[1] - 1) / block_dims[1],
+                (hne0 + block_dims[2] - 1) / block_dims[2]);
+
+            if (block_nums[0] > 65535) {
+                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
+                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
+                {
+                    dpct::has_capability_or_fail(stream->get_device(),
+                                                 {sycl::aspect::fp16});
+                    stream->parallel_for(
+                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
+                                              sycl::range<3>(1, 1, block_size),
+                                          sycl::range<3>(1, 1, block_size)),
+                        [=](sycl::nd_item<3> item_ct1) {
+                            k_bin_bcast_unravel<bin_op>(
+                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
+                                ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
+                                s13, item_ct1);
+                        });
+                }
+            } else {
+                /*
+                DPCT1049:31: The work-group size passed to the SYCL kernel may
+                exceed the limit. To get the device limit, query
+                info::device::max_work_group_size. Adjust the work-group size if
+                needed.
+                */
+                dpct::has_capability_or_fail(stream->get_device(),
+                                             {sycl::aspect::fp16});
+                stream->parallel_for(
+                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                    [=](sycl::nd_item<3> item_ct1) {
+                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
+                                            ne2, ne3, ne10, ne11, ne12, ne13,
+                                            s1, s2, s3, s11, s12, s13,
+                                            item_ct1);
+                    });
+            }
+        }
+    }
+};
+
+static void acc_f32_cuda(const float *x, const float *y, float *dst,
+                         const int n_elements, const int ne10, const int ne11,
+                         const int ne12, const int nb1, const int nb2,
+                         const int offset, dpct::queue_ptr stream) {
+    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
+                    item_ct1);
+        });
+}
+
+static void gelu_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            gelu_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void silu_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            silu_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void gelu_quick_f32_cuda(const float *x, float *dst, const int k,
+                                dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            gelu_quick_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void tanh_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            tanh_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void relu_f32_cuda(const float *x, float *dst, const int k,
+                          dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            relu_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void leaky_relu_f32_cuda(const float *x, float *dst, const int k,
+                                const float negative_slope,
+                                dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
+        });
+}
+
+static void sqr_f32_cuda(const float *x, float *dst, const int k,
+                         dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            sqr_f32(x, dst, k, item_ct1);
+        });
+}
+
+static void norm_f32_cuda(const float *x, float *dst, const int ncols,
+                          const int nrows, const float eps,
+                          dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                sycl::range<1>(32), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
+                                            s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    } else {
+        const sycl::range<3> block_dims(1, 1, 1024);
+        /*
+        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
+                sycl::range<1>(32), cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        norm_f32<1024>(x, dst, ncols, eps, item_ct1,
+                                       s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    }
+}
+
+static void group_norm_f32_cuda(const float *x, float *dst,
+                                const int num_groups, const int group_size,
+                                const int ne_elements, dpct::queue_ptr stream) {
+    static const float eps = 1e-6f;
+    if (group_size < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            const float eps_ct4 = eps;
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        group_norm_f32<WARP_SIZE>(
+                            x, dst, group_size, ne_elements, eps_ct4, item_ct1,
+                            s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    } else {
+        const sycl::range<3> block_dims(1, 1, 1024);
+        /*
+        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            const float eps_ct4 = eps;
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        group_norm_f32<1024>(x, dst, group_size, ne_elements,
+                                             eps_ct4, item_ct1,
+                                             s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    }
+}
+
+static void concat_f32_cuda(const float *x, const float *y, float *dst,
+                            const int ne0, int ne1, int ne2, int ne02,
+                            dpct::queue_ptr stream) {
+    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    sycl::range<3> gridDim(ne2, ne1, num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            concat_f32(x, y, dst, ne0, ne02, item_ct1);
+        });
+}
+
+static void upscale_f32_cuda(const float *x, float *dst, const int ne00,
+                             const int ne01, const int ne02,
+                             const int scale_factor, dpct::queue_ptr stream) {
+    int ne0 = (ne00 * scale_factor);
+    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+    sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim *
+                              sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
+        });
+}
+
+static void pad_f32_cuda(const float *x, float *dst, const int ne00,
+                         const int ne01, const int ne02, const int ne0,
+                         const int ne1, const int ne2, dpct::queue_ptr stream) {
+    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    sycl::range<3> gridDim(ne2, ne1, num_blocks);
+    stream->parallel_for(
+        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
+        });
+}
+
+static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols,
+                              const int nrows, const float eps,
+                              dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % WARP_SIZE == 0);
+    if (ncols < 1024) {
+        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        rms_norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
+                                                s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    } else {
+        const sycl::range<3> block_dims(1, 1, 1024);
+        /*
+        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->submit([&](sycl::handler &cgh) {
+            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
+                                                         cgh);
+
+            cgh.parallel_for(
+                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
+                                  block_dims),
+                [=](sycl::nd_item<3> item_ct1)
+                    [[intel::reqd_sub_group_size(32)]] {
+                        rms_norm_f32<1024>(x, dst, ncols, eps, item_ct1,
+                                           s_sum_acc_ct1.get_pointer());
+                    });
+        });
+    }
+}
+
+static void quantize_row_q8_1_cuda(const float *x, void *vy, const int kx,
+                                   const int ky, const int kx_padded,
+                                   dpct::queue_ptr stream) {
+    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const sycl::range<3> num_blocks(1, ky, block_num_x);
+    const sycl::range<3> block_size(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(num_blocks * block_size, block_size),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                quantize_q8_1(x, vy, kx, kx_padded, item_ct1);
+            });
+    }
+}
+
+template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static void dequantize_block_cuda(const void *__restrict__ vx,
+                                  dst_t *__restrict__ y, const int k,
+                                  dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(
+                sycl::range<3>(1, 1, num_blocks) *
+                    sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE),
+                sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
+            });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q2_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q2_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q3_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q3_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q4_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 32),
+                                               sycl::range<3>(1, 1, 32)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q4_K(vx, y, item_ct1);
+                             });
+    }
+}
+
+template <typename dst_t>
+static void dequantize_row_q5_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q5_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+template <typename dst_t>
+static void dequantize_row_q6_K_cuda(const void *vx, dst_t *y, const int k,
+                                     dpct::queue_ptr stream) {
+    const int nb = k / QK_K;
+#if QK_K == 256
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
+                                                   sycl::range<3>(1, 1, 64),
+                                               sycl::range<3>(1, 1, 64)),
+                             [=](sycl::nd_item<3> item_ct1) {
+                                 dequantize_block_q6_K(vx, y, item_ct1);
+                             });
+    }
+#else
+    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
+#endif
+}
+
+static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F32:
+            return dequantize_block_cuda<1, 1, convert_f32>;
+        default:
+            return nullptr;
+    }
+}
+
+static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+    switch (type) {
+        case GGML_TYPE_Q4_0:
+            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+        case GGML_TYPE_Q4_1:
+            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+        case GGML_TYPE_Q5_0:
+            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+        case GGML_TYPE_Q5_1:
+            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+        case GGML_TYPE_Q8_0:
+            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+        case GGML_TYPE_Q2_K:
+            return dequantize_row_q2_K_cuda;
+        case GGML_TYPE_Q3_K:
+            return dequantize_row_q3_K_cuda;
+        case GGML_TYPE_Q4_K:
+            return dequantize_row_q4_K_cuda;
+        case GGML_TYPE_Q5_K:
+            return dequantize_row_q5_K_cuda;
+        case GGML_TYPE_Q6_K:
+            return dequantize_row_q6_K_cuda;
+        case GGML_TYPE_F16:
+            return dequantize_block_cuda<1, 1, convert_f16>;
+        default:
+            return nullptr;
+    }
+}
+
+static void dequantize_mul_mat_vec_q4_0_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q4_1_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q5_0_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q5_1_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q8_0_cuda(const void *vx, const dfloat *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
+                    vx, y, dst, ncols, nrows, item_ct1);
+            });
+    }
+}
+
+static void dequantize_mul_mat_vec_q2_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q3_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q4_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q5_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const sycl::range<3> block_dims(1, 1, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
+        });
+}
+
+static void dequantize_mul_mat_vec_q6_K_cuda(const void *vx, const float *y,
+                                             float *dst, const int ncols,
+                                             const int nrows,
+                                             dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int ny = 2 / K_QUANTS_PER_ITERATION;
+    const int block_num_y = (nrows + ny - 1) / ny;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, ny, 32);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
+        });
+}
+
+static void convert_mul_mat_vec_f16_cuda(const void *vx, const dfloat *y,
+                                         float *dst, const int ncols,
+                                         const int nrows,
+                                         dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
+                                                          nrows, item_ct1);
+            });
+    }
+}
+
+static void mul_mat_vec_q4_0_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
+                          vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK4_1 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
+                          vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK5_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
+                          vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK5_1 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
+                          vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK8_0 == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
+                          vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
+                          vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
+                          vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
+                          vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
+                          vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
+                                       float *dst, const int ncols,
+                                       const int nrows,
+                                       dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % QK_K == 0);
+    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const sycl::range<3> block_nums(1, 1, block_num_y);
+    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    stream->parallel_for(
+        sycl::nd_range<3>(block_nums * block_dims, block_dims),
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+            mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
+                          vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
+                                             item_ct1);
+        });
+}
+
+static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q4_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
+        nwarps = NWARPS_Q4_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q4_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
+        nwarps = NWARPS_Q4_0_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q4_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
+        nwarps = NWARPS_Q4_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
+        nwarps = NWARPS_Q4_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q4_1_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
+        nwarps = NWARPS_Q4_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q4_1_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
+        nwarps = NWARPS_Q4_1_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q4_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
+        nwarps = NWARPS_Q4_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
+        nwarps = NWARPS_Q4_1_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q5_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
+        nwarps = NWARPS_Q5_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q5_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
+        nwarps = NWARPS_Q5_0_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q5_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
+        nwarps = NWARPS_Q5_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
+        nwarps = NWARPS_Q5_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q5_1_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
+        nwarps = NWARPS_Q5_1_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q5_1_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
+        nwarps = NWARPS_Q5_1_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q5_1_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
+        nwarps = NWARPS_Q5_1_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_1_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
+        nwarps = NWARPS_Q5_1_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q8_0_RDNA2;
+        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
+        nwarps = NWARPS_Q8_0_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q8_0_RDNA1;
+        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
+        nwarps = NWARPS_Q8_0_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q8_0_AMPERE;
+        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
+        nwarps = NWARPS_Q8_0_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q8_0_PASCAL;
+        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
+        nwarps = NWARPS_Q8_0_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q2_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
+        nwarps = NWARPS_Q2_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q2_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
+        nwarps = NWARPS_Q2_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q2_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
+        nwarps = NWARPS_Q2_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q2_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
+        nwarps = NWARPS_Q2_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:46: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+#if QK_K == 256
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q3_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
+        nwarps = NWARPS_Q3_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q3_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
+        nwarps = NWARPS_Q3_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q3_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
+        nwarps = NWARPS_Q3_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q3_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
+        nwarps = NWARPS_Q3_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:48: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    }
+#endif
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q4_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
+        nwarps = NWARPS_Q4_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q4_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
+        nwarps = NWARPS_Q4_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q4_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
+        nwarps = NWARPS_Q4_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q4_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
+        nwarps = NWARPS_Q4_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:49: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:50: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q5_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
+        nwarps = NWARPS_Q5_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q5_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
+        nwarps = NWARPS_Q5_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q5_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
+        nwarps = NWARPS_Q5_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q5_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
+        nwarps = NWARPS_Q5_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:51: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:52: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
+                                        float *dst, const int ncols_x,
+                                        const int nrows_x, const int ncols_y,
+                                        const int nrows_y, const int nrows_dst,
+                                        dpct::queue_ptr stream) try {
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+    const int compute_capability = g_compute_capabilities[id];
+
+    int mmq_x, mmq_y, nwarps;
+    if (compute_capability >= CC_RDNA2) {
+        mmq_x  =  MMQ_X_Q6_K_RDNA2;
+        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
+        nwarps = NWARPS_Q6_K_RDNA2;
+    } else if (compute_capability >= CC_OFFSET_AMD) {
+        mmq_x  =  MMQ_X_Q6_K_RDNA1;
+        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
+        nwarps = NWARPS_Q6_K_RDNA1;
+    } else if (compute_capability >= CC_VOLTA) {
+        mmq_x  =  MMQ_X_Q6_K_AMPERE;
+        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
+        nwarps = NWARPS_Q6_K_AMPERE;
+    } else if (compute_capability >= MIN_CC_DP4A) {
+        mmq_x  =  MMQ_X_Q6_K_PASCAL;
+        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
+        nwarps = NWARPS_Q6_K_PASCAL;
+    } else {
+        GGML_ASSERT(false);
+    }
+
+    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
+    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
+    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
+    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
+
+    if (nrows_x % mmq_y == 0) {
+        const bool need_check = false;
+        /*
+        DPCT1049:53: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    } else {
+        const bool need_check = true;
+        /*
+        DPCT1049:54: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
+                                         nrows_y, nrows_dst);
+            });
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_mul_mat_p021_f16_f32_cuda(const void *vx, const float *y,
+                                           float *dst, const int ncols_x,
+                                           const int nrows_x,
+                                           const int nchannels_x,
+                                           const int nchannels_y,
+                                           dpct::queue_ptr stream) {
+
+    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
+                                     nchannels_y, item_ct1);
+            });
+    }
+}
+
+static void ggml_mul_mat_vec_nc_f16_f32_cuda(
+    const void *vx, const float *y, float *dst, const int ncols_x,
+    const int nrows_x, const int row_stride_x, const int nchannels_x,
+    const int nchannels_y, const int channel_stride_x, dpct::queue_ptr stream) {
+
+    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
+                                       row_stride_x, channel_stride_x,
+                                       nchannels_y / nchannels_x, item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_f32_q8_0_cuda(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int nb00, const int nb01,
+                                   const int nb02, const int ne10,
+                                   const int ne11, const int nb10,
+                                   const int nb11, const int nb12,
+                                   dpct::queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK8_0 == 0);
+    const int num_blocks = ne / QK8_0;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
+                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
+                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
+                         });
+}
+
+static void ggml_cpy_f32_q4_0_cuda(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int nb00, const int nb01,
+                                   const int nb02, const int ne10,
+                                   const int ne11, const int nb10,
+                                   const int nb11, const int nb12,
+                                   dpct::queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK4_0 == 0);
+    const int num_blocks = ne / QK4_0;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
+                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
+                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
+                         });
+}
+
+static void ggml_cpy_f32_q4_1_cuda(const char *cx, char *cdst, const int ne,
+                                   const int ne00, const int ne01,
+                                   const int nb00, const int nb01,
+                                   const int nb02, const int ne10,
+                                   const int ne11, const int nb10,
+                                   const int nb11, const int nb12,
+                                   dpct::queue_ptr stream) {
+
+    GGML_ASSERT(ne % QK4_1 == 0);
+    const int num_blocks = ne / QK4_1;
+    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
+                                           sycl::range<3>(1, 1, 1)),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
+                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
+                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
+                         });
+}
+
+static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void scale_f32_cuda(const float *x, float *dst, const float scale,
+                           const int k, dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            scale_f32(x, dst, scale, k, item_ct1);
+        });
+}
+
+static void clamp_f32_cuda(const float *x, float *dst, const float min,
+                           const float max, const int k,
+                           dpct::queue_ptr stream) {
+    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    stream->parallel_for(
+        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                              sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE)),
+        [=](sycl::nd_item<3> item_ct1) {
+            clamp_f32(x, dst, min, max, k, item_ct1);
+        });
+}
+
+template <typename T>
+static void rope_cuda(const T *x, T *dst, int ncols, int nrows,
+                      const int32_t *pos, float freq_scale, int p_delta_rows,
+                      float freq_base, float ext_factor, float attn_factor,
+                      rope_corr_dims corr_dims, dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % 2 == 0);
+    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
+    if (pos == nullptr) {
+        /*
+        DPCT1049:55: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows,
+                               freq_base, ext_factor, attn_factor, corr_dims,
+                               item_ct1);
+            });
+    } else {
+        /*
+        DPCT1049:56: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows,
+                              freq_base, ext_factor, attn_factor, corr_dims,
+                              item_ct1);
+            });
+    }
+}
+
+template <typename T>
+static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows,
+                           const int32_t *pos, float freq_scale,
+                           int p_delta_rows, float freq_base, float ext_factor,
+                           float attn_factor, rope_corr_dims corr_dims,
+                           dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % 2 == 0);
+    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
+
+    const float theta_scale = powf(freq_base, -2.0f/n_dims);
+    const float inv_ndims = -1.0f / n_dims;
+
+    if (pos == nullptr) {
+        /*
+        DPCT1049:57: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_neox<T, false>(x, dst, ncols, n_dims, pos, freq_scale,
+                                    p_delta_rows, ext_factor, attn_factor,
+                                    corr_dims, theta_scale, inv_ndims,
+                                    item_ct1);
+            });
+    } else {
+        /*
+        DPCT1049:58: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                rope_neox<T, true>(x, dst, ncols, n_dims, pos, freq_scale,
+                                   p_delta_rows, ext_factor, attn_factor,
+                                   corr_dims, theta_scale, inv_ndims, item_ct1);
+            });
+    }
+}
+
+static void rope_glm_f32_cuda(const float *x, float *dst, int ncols, int nrows,
+                              const int32_t *pos, float freq_scale,
+                              int p_delta_rows, float freq_base, int n_ctx,
+                              dpct::queue_ptr stream) {
+    GGML_ASSERT(ncols % 4 == 0);
+    const sycl::range<3> block_dims(1, 1, CUDA_ROPE_BLOCK_SIZE / 4);
+    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
+    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             rope_glm_f32(x, dst, ncols, pos, freq_scale,
+                                          p_delta_rows, freq_base, n_ctx,
+                                          item_ct1);
+                         });
+}
+
+static void alibi_f32_cuda(const float *x, float *dst, const int ncols,
+                           const int nrows, const int k_rows,
+                           const int n_heads_log2_floor, const float m0,
+                           const float m1, dpct::queue_ptr stream) {
+    const sycl::range<3> block_dims(1, 1, CUDA_ALIBI_BLOCK_SIZE);
+    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             alibi_f32(x, dst, ncols, k_rows,
+                                       n_heads_log2_floor, m0, m1, item_ct1);
+                         });
+}
+
+static void sum_rows_f32_cuda(const float *x, float *dst, const int ncols,
+                              const int nrows, dpct::queue_ptr stream) {
+    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1)
+                             [[intel::reqd_sub_group_size(32)]] {
+                                 k_sum_rows_f32(x, dst, ncols, item_ct1);
+                             });
+}
+
+static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols,
+                                 const int nrows, ggml_sort_order order,
+                                 dpct::queue_ptr stream) {
+    // bitonic sort requires ncols to be power of 2
+    GGML_ASSERT((ncols & (ncols - 1)) == 0);
+
+    const sycl::range<3> block_dims(1, 1, ncols);
+    const sycl::range<3> block_nums(1, nrows, 1);
+    if (order == GGML_SORT_ASC) {
+        /*
+        DPCT1049:59: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
+            });
+    } else if (order == GGML_SORT_DESC) {
+        /*
+        DPCT1049:60: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) {
+                k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
+            });
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
+static void diag_mask_inf_f32_cuda(const float *x, float *dst,
+                                   const int ncols_x, const int nrows_x,
+                                   const int rows_per_channel, const int n_past,
+                                   dpct::queue_ptr stream) {
+    const sycl::range<3> block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
+    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+    const sycl::range<3> block_nums(1, block_num_x, nrows_x);
+    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
+                         [=](sycl::nd_item<3> item_ct1) {
+                             diag_mask_inf_f32(x, dst, ncols_x,
+                                               rows_per_channel, n_past,
+                                               item_ct1);
+                         });
+}
+
+static void soft_max_f32_cuda(const float *x, const float *y, float *dst,
+                              const int ncols_x, const int nrows_x,
+                              const int nrows_y, const float scale,
+                              dpct::queue_ptr stream) {
+    int nth = WARP_SIZE;
+    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    const sycl::range<3> block_dims(1, 1, nth);
+    const sycl::range<3> block_nums(1, 1, nrows_x);
+    /*
+    DPCT1049:61: The work-group size passed to the SYCL kernel may exceed the
+    limit. To get the device limit, query info::device::max_work_group_size.
+    Adjust the work-group size if needed.
+    */
+    stream->submit([&](sycl::handler &cgh) {
+        /*
+        DPCT1101:125: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
+        replaced with a value. Modify the code to use the original expression,
+        provided in comments, if it is correct.
+        */
+        sycl::local_accessor<float, 1> buf_acc_ct1(
+            sycl::range<1>(32 /*CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
+
+        cgh.parallel_for(
+            sycl::nd_range<3>(block_nums * block_dims, block_dims),
+            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
+                soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
+                             buf_acc_ct1.get_pointer());
+            });
+    });
+}
+
+static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH,
+                                int OW, int OH, int KW, int KH, int IC,
+                                int offset_delta, int s0, int s1, int p0,
+                                int p1, int d0, int d1,
+                                dpct::queue_ptr stream) {
+    const int parallel_elements = OW * KW * KH;
+    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    sycl::range<3> block_nums(IC, OH, num_blocks);
+    {
+        dpct::has_capability_or_fail(stream->get_device(),
+                                     {sycl::aspect::fp16});
+        stream->parallel_for(
+            sycl::nd_range<3>(block_nums *
+                                  sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH,
+                               parallel_elements, (IC * KH * KW), s0, s1, p0,
+                               p1, d0, d1, item_ct1);
+            });
+    }
+}
+
+// buffer pool for cuda
+#define MAX_CUDA_BUFFERS 256
+
+struct scoped_spin_lock {
+    std::atomic_flag& lock;
+    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+        while (lock.test_and_set(std::memory_order_acquire)) {
+            ; // spin
+        }
+    }
+    ~scoped_spin_lock() {
+        lock.clear(std::memory_order_release);
+    }
+    scoped_spin_lock(const scoped_spin_lock&) = delete;
+    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+};
+
+struct cuda_buffer {
+    void * ptr = nullptr;
+    size_t size = 0;
+};
+
+static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+#ifdef DEBUG_CUDA_MALLOC
+    int nnz = 0;
+    size_t max_size = 0, tot_size = 0;
+#endif
+    size_t best_diff = 1ull << 36;
+    int ibest = -1;
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][i];
+        if (b.ptr != nullptr) {
+#ifdef DEBUG_CUDA_MALLOC
+            ++nnz;
+            tot_size += b.size;
+            if (b.size > max_size) max_size = b.size;
+#endif
+            if (b.size >= size) {
+                size_t diff = b.size - size;
+                if (diff < best_diff) {
+                    best_diff = diff;
+                    ibest = i;
+                    if (!best_diff) {
+                        void * ptr = b.ptr;
+                        *actual_size = b.size;
+                        b.ptr = nullptr;
+                        b.size = 0;
+                        return ptr;
+                    }
+                }
+            }
+        }
+    }
+    if (ibest >= 0) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        void * ptr = b.ptr;
+        *actual_size = b.size;
+        b.ptr = nullptr;
+        b.size = 0;
+        return ptr;
+    }
+#ifdef DEBUG_CUDA_MALLOC
+    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
+            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
+#endif
+    void * ptr;
+    size_t look_ahead_size = (size_t) (1.05 * size);
+    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(
+                             look_ahead_size, dpct::get_in_order_queue())));
+    *actual_size = look_ahead_size;
+    return ptr;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_pool_free(void *ptr, size_t size) try {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+
+    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
+        cuda_buffer& b = g_cuda_buffer_pool[id][i];
+        if (b.ptr == nullptr) {
+            b.ptr = ptr;
+            b.size = size;
+            return;
+        }
+    }
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static bool g_cublas_loaded = false;
+
+bool ggml_cublas_loaded(void) {
+    return g_cublas_loaded;
+}
+
+void print_devices(int device_count){
+    for (int id = 0; id < device_count; ++id) {
+        dpct::device_info prop;
+        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(id))));
+
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id,
+                prop.get_name(), prop.get_major_version(),
+                prop.get_minor_version());
+    }
+
+}
+
+void ggml_init_cublas() try {
+    static bool initialized = false;
+
+    if (!initialized) {
+
+#ifdef __HIP_PLATFORM_AMD__
+        // Workaround for a rocBLAS bug when using multiple graphics cards:
+        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
+        rocblas_initialize();
+        CUDA_CHECK(cudaDeviceSynchronize());
+#endif
+
+        g_device_count = dpct::dev_mgr::instance().device_count();
+        if (DPCT_CHECK_ERROR(g_device_count != 0)) {
+            initialized = true;
+            g_cublas_loaded = false;
+            return;
+        }
+
+        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
+        int64_t total_vram = 0;
+#if defined(GGML_CUDA_FORCE_MMQ)
+        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
+#else
+        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
+#endif
+#if defined(CUDA_USE_TENSOR_CORES)
+        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#else
+        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+#endif
+        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
+        print_devices(g_device_count);
+
+        char * user_device_string = getenv("GGML_SYCL_DEVICE");
+        int user_device_number = -1;
+
+        unsigned n;
+        if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < g_device_count) {
+            user_device_number = (int)n;
+        } else {
+            user_device_number=0;
+        }
+
+        //zjy hardcode, force set to 1 device
+        int id = 0;
+        g_tensor_split[id] = total_vram;
+        dpct::device_info prop;
+        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(user_device_number))));
+
+        total_vram += prop.get_global_mem_size();
+        g_compute_capabilities[id] =
+                100 * prop.get_major_version() + 10 * prop.get_minor_version();
+
+        g_device_count = 1;
+        for (int id = 0; id < g_device_count; ++id) {
+            g_tensor_split[id] /= total_vram;
+        }
+
+        for (int id = 0; id < g_device_count; ++id) {
+            CUDA_CHECK(ggml_cuda_set_device(user_device_number));
+
+            // create cuda streams
+            for (int is = 0; is < MAX_STREAMS; ++is) {
+                /*
+                DPCT1025:105: The SYCL queue is created ignoring the flag and
+                priority options.
+                */
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    g_cudaStreams[id][is] =
+                        dpct::get_current_device().create_queue()));
+            }
+
+            // create cublas handle
+            CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
+                                              &dpct::get_in_order_queue()));
+            /*
+            DPCT1027:107: The call to cublasSetMathMode was replaced with 0
+            because this call is redundant in SYCL.
+            */
+            CUBLAS_CHECK(0);
+        }
+
+        // configure logging to stdout
+        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+
+
+        ggml_cuda_set_device(user_device_number);
+        fprintf(stderr, "  set Device %d\n", user_device_number);
+
+        initialized = true;
+        g_cublas_loaded = true;
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_set_tensor_split(const float * tensor_split) {
+    if (tensor_split == nullptr) {
+        return;
+    }
+    bool all_zero = true;
+    for (int i = 0; i < g_device_count; ++i) {
+        if (tensor_split[i] != 0.0f) {
+            all_zero = false;
+            break;
+        }
+    }
+    if (all_zero) {
+        return;
+    }
+    float split_sum = 0.0f;
+    for (int i = 0; i < g_device_count; ++i) {
+        g_tensor_split[i] = split_sum;
+        split_sum += tensor_split[i];
+    }
+    for (int i = 0; i < g_device_count; ++i) {
+        g_tensor_split[i] /= split_sum;
+    }
+}
+
+void *ggml_cuda_host_malloc(size_t size) try {
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    dpct::err0 err = DPCT_CHECK_ERROR(
+        ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
+    /*
+    DPCT1000:109: Error handling if-stmt was detected but could not be
+    rewritten.
+    */
+    if (err != 0) {
+        // The allocation error can be bypassed. A null ptr will assigned out of this function.
+        // This can fixed the OOM error in WSL.
+        /*
+        DPCT1026:110: The call to cudaGetLastError was removed because this call
+        is redundant in SYCL.
+        */
+        /*
+        DPCT1001:108: The statement could not be removed.
+        */
+        fprintf(
+            stderr,
+            "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            /*
+            DPCT1009:111: SYCL uses exceptions to report errors and does not use
+            the error codes. The original code was commented out and a warning
+            string was inserted. You need to rewrite this code.
+            */
+            size / 1024.0 / 1024.0,
+            "cudaGetErrorString is not supported" /*cudaGetErrorString(err)*/);
+        return nullptr;
+    }
+
+    return ptr;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_host_free(void *ptr) try {
+    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
+                                          const struct ggml_tensor *src,
+                                          int64_t i3, int64_t i2,
+                                          int64_t i1_low, int64_t i1_high,
+                                          dpct::queue_ptr stream) try {
+
+    dpct::memcpy_direction kind;
+    char * src_ptr;
+    if (src->backend == GGML_BACKEND_CPU) {
+        kind = dpct::host_to_device;
+        src_ptr = (char *) src->data;
+    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
+        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
+        kind = dpct::device_to_device;
+        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+        int id;
+        id = dpct::dev_mgr::instance().current_device_id();
+        // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+        src_ptr = (char *) extra->data_device[id];
+    } else {
+        GGML_ASSERT(false);
+    }
+    char * dst_ptr = (char *) dst;
+
+    const int64_t ne0 = src->ne[0];
+    const int64_t nb0 = src->nb[0];
+    const int64_t nb1 = src->nb[1];
+    const int64_t nb2 = src->nb[2];
+    const int64_t nb3 = src->nb[3];
+    const enum ggml_type type = src->type;
+    const int64_t ts = ggml_type_size(type);
+    const int64_t bs = ggml_blck_size(type);
+    int64_t i1_diff = i1_high - i1_low;
+
+    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
+    if (nb0 == ts && nb1 == ts*ne0/bs) {
+        return DPCT_CHECK_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1));
+    } else if (nb0 == ts) {
+        return DPCT_CHECK_ERROR(
+            dpct::async_dpct_memcpy(dst_ptr, ts * ne0 / bs, x, nb1,
+                                    ts * ne0 / bs, i1_diff, kind, *stream));
+    } else {
+        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
+            const void * rx = (const void *) ((const char *) x + i1*nb1);
+            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
+            // pretend the row is a matrix with cols=1
+            dpct::err0 r = DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
+                rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream));
+            /*
+            DPCT1001:112: The statement could not be removed.
+            */
+            /*
+            DPCT1000:113: Error handling if-stmt was detected but could not be
+            rewritten.
+            */
+            if (r != 0) return r;
+        }
+        return 0;
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_op_get_rows(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_d, const float *src1_d,
+                                  float *dst_d, const dpct::queue_ptr &stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_I32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
+    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
+    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
+
+    const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+    switch (src0->type) {
+        case GGML_TYPE_F16:
+            get_rows_cuda_float(src0, src1, dst, (const sycl::half *)src0_d,
+                                src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_F32:
+            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_0:
+            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            break;
+        default:
+            // TODO: k-quants
+            GGML_ASSERT(false);
+            break;
+    }
+}
+
+template <class op>
+inline void ggml_cuda_op_bin_bcast(const ggml_tensor *src0,
+                                   const ggml_tensor *src1, ggml_tensor *dst,
+                                   const float *src0_dd, const float *src1_dd,
+                                   float *dst_dd,
+                                   const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
+        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
+             (sycl::half *)dst_dd, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
+        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
+             main_stream);
+    } else {
+        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
+            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ASSERT(false);
+    }
+}
+
+static void ggml_cuda_op_repeat(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const float *src0_d, const float *src1_d,
+                                float *dst_d,
+                                const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
+
+    (void) src1;
+    (void) src1_d;
+}
+
+inline void ggml_cuda_op_add(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_cuda_op_acc(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
+
+    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
+    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
+    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
+    int offset = dst->op_params[3] / 4; // offset in bytes
+
+    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
+
+    (void) dst;
+}
+
+inline void ggml_cuda_op_mul(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_cuda_op_div(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+}
+
+inline void ggml_cuda_op_gelu(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_silu(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_gelu_quick(const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_relu(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_leaky_relu(const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float negative_slope;
+    memcpy(&negative_slope, dst->op_params, sizeof(float));
+
+    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_sqr(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_norm(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_group_norm(const ggml_tensor *src0,
+                                    const ggml_tensor *src1, ggml_tensor *dst,
+                                    const float *src0_dd, const float *src1_dd,
+                                    float *dst_dd,
+                                    const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    int num_groups = dst->op_params[0];
+    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
+    group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_concat(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const float *src0_dd, const float *src1_dd,
+                                float *dst_dd,
+                                const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+
+    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
+        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
+    }
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_upscale(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const float *src0_dd, const float *src1_dd,
+                                 float *dst_dd,
+                                 const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    const int scale_factor = dst->op_params[0];
+
+    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_pad(const ggml_tensor *src0, const ggml_tensor *src1,
+                             ggml_tensor *dst, const float *src0_dd,
+                             const float *src1_dd, float *dst_dd,
+                             const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
+
+    pad_f32_cuda(src0_dd, dst_dd,
+        src0->ne[0], src0->ne[1], src0->ne[2],
+        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
+
+    (void) src1;
+    (void) dst;
+}
+
+inline void ggml_cuda_op_rms_norm(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int ne00 = src0->ne[0];
+    const int nrows = ggml_nrows(src0);
+
+    float eps;
+    memcpy(&eps, dst->op_params, sizeof(float));
+
+    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_mul_mat_q(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) try {
+
+    const int64_t ne00 = src0->ne[0];
+
+    const int64_t ne10 = src1->ne[0];
+    GGML_ASSERT(ne10 % QK8_1 == 0);
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t row_diff = row_high - row_low;
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_ddf_i;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static int64_t get_row_rounding(ggml_type type) {
+    int64_t min_compute_capability = INT_MAX;
+    int64_t max_compute_capability = INT_MIN;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            if (min_compute_capability > g_compute_capabilities[id]) {
+                min_compute_capability = g_compute_capabilities[id];
+            }
+            if (max_compute_capability < g_compute_capabilities[id]) {
+                max_compute_capability = g_compute_capabilities[id];
+            }
+        }
+    }
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            return 1;
+        case GGML_TYPE_Q2_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
+        case GGML_TYPE_Q3_K:
+            return min_compute_capability < CC_RDNA2 ? 128 : 64;
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+        case GGML_TYPE_Q6_K:
+            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
+        default:
+            GGML_ASSERT(false);
+    }
+#else
+    switch(type) {
+        case GGML_TYPE_Q4_0:
+        case GGML_TYPE_Q4_1:
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
+        case GGML_TYPE_Q5_0:
+        case GGML_TYPE_Q5_1:
+        case GGML_TYPE_Q8_0:
+            return 64;
+        case GGML_TYPE_F16:
+        case GGML_TYPE_F32:
+            return 1;
+        case GGML_TYPE_Q2_K:
+        case GGML_TYPE_Q3_K:
+        case GGML_TYPE_Q4_K:
+        case GGML_TYPE_Q5_K:
+            return max_compute_capability >= CC_VOLTA ? 128 : 64;
+        case GGML_TYPE_Q6_K:
+            return 64;
+        default:
+            GGML_ASSERT(false);
+    }
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+}
+
+inline void ggml_cuda_op_mul_mat_vec_q(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) {
+
+    GGML_ASSERT(ggml_nrows(src1) == 1);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_ddf_i;
+    (void) src1_ncols;
+    (void) src1_padded_row_size;
+}
+
+inline void ggml_cuda_op_dequantize_mul_mat_vec(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) {
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t row_diff = row_high - row_low;
+
+    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
+#ifdef GGML_CUDA_F16
+    size_t ash;
+    dfloat * src1_dfloat = nullptr; // dfloat == half
+
+    bool src1_convert_f16 =
+        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
+        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
+        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
+
+    if (src1_convert_f16) {
+        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
+        ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
+                                ne00, 1, sizeof(float), 0, 0,
+                                ne00, 1, sizeof(half),  0, 0, stream);
+    }
+#else
+    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
+#endif // GGML_CUDA_F16
+
+    switch (src0->type) {
+        case GGML_TYPE_Q4_0:
+            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_1:
+            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_0:
+            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_1:
+            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q8_0:
+            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q2_K:
+            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q3_K:
+            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q4_K:
+            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q5_K:
+            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_Q6_K:
+            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            break;
+        case GGML_TYPE_F16:
+            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            break;
+        default:
+            GGML_ASSERT(false);
+            break;
+    }
+
+#ifdef GGML_CUDA_F16
+    if (src1_convert_f16) {
+        ggml_cuda_pool_free(src1_dfloat, ash);
+    }
+#endif // GGML_CUDA_F16
+
+    (void) src1;
+    (void) dst;
+    (void) src1_ddq_i;
+    (void) src1_ncols;
+    (void) src1_padded_row_size;
+}
+
+inline void ggml_cuda_op_mul_mat_cublas(
+    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
+    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
+    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
+    const int64_t src1_ncols, const int64_t src1_padded_row_size,
+    const dpct::queue_ptr &stream) try {
+
+    GGML_ASSERT(src0_dd_i  != nullptr);
+    GGML_ASSERT(src1_ddf_i != nullptr);
+    GGML_ASSERT(dst_dd_i   != nullptr);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+
+    const int64_t row_diff = row_high - row_low;
+
+    int id;
+    id = dpct::dev_mgr::instance().current_device_id();
+    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
+
+    // the main device has a larger memory buffer to hold the results from all GPUs
+    // ldc == nrows of the matrix that cuBLAS writes into
+    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+
+    const int compute_capability = g_compute_capabilities[id];
+
+    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
+        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
+        sycl::half *src0_as_f16 = nullptr;
+        size_t src0_as = 0;
+        if (src0->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = row_diff*ne00;
+            src0_as_f16 = (sycl::half *)ggml_cuda_pool_malloc(
+                ne * sizeof(sycl::half), &src0_as);
+            to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
+        }
+        const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16
+                                         ? (const sycl::half *)src0_dd_i
+                                         : src0_as_f16;
+
+        sycl::half *src1_as_f16 = nullptr;
+        size_t src1_as = 0;
+        if (src1->type != GGML_TYPE_F16) {
+            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+            GGML_ASSERT(to_fp16_cuda != nullptr);
+            size_t ne = src1_ncols*ne10;
+            src1_as_f16 = (sycl::half *)ggml_cuda_pool_malloc(
+                ne * sizeof(sycl::half), &src1_as);
+            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
+        }
+        const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
+                                         ? (const sycl::half *)src1_ddf_i
+                                         : src1_as_f16;
+        size_t dst_as = 0;
+        sycl::half *dst_f16 = (sycl::half *)ggml_cuda_pool_malloc(
+            row_diff * src1_ncols * sizeof(sycl::half), &dst_as);
+
+        const sycl::half alpha_f16 = 1.0f;
+        const sycl::half beta_f16 = 0.0f;
+
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm(
+            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
+            &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
+            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, dst_f16,
+            dpct::library_data_t::real_half, ldc,
+            dpct::library_data_t::real_half)));
+
+        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
+
+        ggml_cuda_pool_free(dst_f16, dst_as);
+
+        if (src0_as != 0) {
+            ggml_cuda_pool_free(src0_as_f16, src0_as);
+        }
+
+        if (src1_as != 0) {
+            ggml_cuda_pool_free(src1_as_f16, src1_as);
+        }
+    }
+    else {
+        float * src0_ddq_as_f32 = nullptr;
+        size_t src0_as = 0;
+
+        if (src0->type != GGML_TYPE_F32) {
+            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
+            GGML_ASSERT(to_fp32_cuda != nullptr);
+            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
+            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
+        }
+        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
+
+        const float alpha = 1.0f;
+        const float beta = 0.0f;
+
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(
+            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, alpha,
+            src0_ddf_i, ne00, src1_ddf_i, ne10, beta, dst_dd_i, ldc)));
+
+        if (src0_as != 0) {
+            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
+        }
+    }
+
+    (void) dst;
+    (void) src1_ddq_i;
+    (void) src1_padded_row_size;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+inline void ggml_cuda_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
+                              ggml_tensor *dst, const float *src0_dd,
+                              const float *src1_dd, float *dst_dd,
+                              const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne2 = dst->ne[2];
+    const int64_t nrows = ggml_nrows(src0);
+
+    //const int n_past      = ((int32_t *) dst->op_params)[0];
+    const int n_dims      = ((int32_t *) dst->op_params)[1];
+    const int mode        = ((int32_t *) dst->op_params)[2];
+    const int n_ctx       = ((int32_t *) dst->op_params)[3];
+    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
+
+    // RoPE alteration for extended context
+    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
+    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
+    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
+    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
+    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
+    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
+    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
+
+    const int32_t * pos = nullptr;
+    if ((mode & 1) == 0) {
+        GGML_ASSERT(src1->type == GGML_TYPE_I32);
+        GGML_ASSERT(src1->ne[0] == ne2);
+        pos = (const int32_t *) src1_dd;
+    }
+
+    const bool is_neox = mode & 2;
+    const bool is_glm  = mode & 4;
+
+    rope_corr_dims corr_dims;
+    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
+
+    // compute
+    if (is_glm) {
+        GGML_ASSERT(false);
+        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
+    } else if (is_neox) {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_neox_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_neox_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
+                           ne00, n_dims, nrows, pos, freq_scale, ne01,
+                           freq_base, ext_factor, attn_factor, corr_dims,
+                           main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
+    } else {
+        if (src0->type == GGML_TYPE_F32) {
+            rope_cuda(
+                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                attn_factor, corr_dims, main_stream
+            );
+        } else if (src0->type == GGML_TYPE_F16) {
+            rope_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
+                      nrows, pos, freq_scale, ne01, freq_base, ext_factor,
+                      attn_factor, corr_dims, main_stream);
+        } else {
+            GGML_ASSERT(false);
+        }
+    }
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t nrows = ggml_nrows(src0);
+
+    //const int n_past = ((int32_t *) dst->op_params)[0];
+    const int n_head = ((int32_t *) dst->op_params)[1];
+    float max_bias;
+    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
+
+    //GGML_ASSERT(ne01 + n_past == ne00);
+    GGML_ASSERT(n_head == ne02);
+
+    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
+
+    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
+
+    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
+
+    (void) src1;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_im2col(const ggml_tensor *src0,
+                                const ggml_tensor *src1, ggml_tensor *dst,
+                                const float *src0_dd, const float *src1_dd,
+                                float *dst_dd,
+                                const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F16);
+
+    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+
+    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+    const int64_t IC = src1->ne[is_2D ? 2 : 1];
+    const int64_t IH = is_2D ? src1->ne[1] : 1;
+    const int64_t IW =         src1->ne[0];
+
+    const int64_t KH = is_2D ? src0->ne[1] : 1;
+    const int64_t KW =         src0->ne[0];
+
+    const int64_t OH = is_2D ? dst->ne[2] : 1;
+    const int64_t OW =         dst->ne[1];
+
+    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
+
+    im2col_f32_f16_cuda(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH,
+                        IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
+
+    (void) src0;
+    (void) src0_dd;
+}
+
+inline void ggml_cuda_op_sum_rows(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_argsort(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const float *src0_dd, const float *src1_dd,
+                                 float *dst_dd,
+                                 const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_I32);
+
+    const int64_t ncols = src0->ne[0];
+    const int64_t nrows = ggml_nrows(src0);
+
+    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
+
+    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_diag_mask_inf(const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst, const float *src0_dd,
+                                       const float *src1_dd, float *dst_dd,
+                                       const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int nrows0 = ggml_nrows(src0);
+
+    const int n_past = ((int32_t *) dst->op_params)[0];
+
+    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+inline void ggml_cuda_op_soft_max(const ggml_tensor *src0,
+                                  const ggml_tensor *src1, ggml_tensor *dst,
+                                  const float *src0_dd, const float *src1_dd,
+                                  float *dst_dd,
+                                  const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t nrows_x = ggml_nrows(src0);
+    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
+
+    float scale = 1.0f;
+    memcpy(&scale, dst->op_params, sizeof(float));
+
+    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
+
+    (void) dst;
+}
+
+inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const dpct::queue_ptr &main_stream) try {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float scale;
+    // HACK: support for ggml backend interface
+    if (src1->backend == GGML_BACKEND_CPU) {
+        scale = ((float *) src1->data)[0];
+    } else {
+        // TODO: pass pointer to kernel instead of copying to host
+        CUDA_CHECK(
+            DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                                 .memcpy(&scale, src1->data, sizeof(float))
+                                 .wait()));
+    }
+
+    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
+    /*
+    DPCT1010:114: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    CUDA_CHECK(0);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
+                               ggml_tensor *dst, const float *src0_dd,
+                               const float *src1_dd, float *dst_dd,
+                               const dpct::queue_ptr &main_stream) {
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+
+    float min;
+    float max;
+    memcpy(&min, dst->op_params, sizeof(float));
+    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
+
+    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    /*
+    DPCT1010:115: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    CUDA_CHECK(0);
+
+    (void) src1;
+    (void) dst;
+    (void) src1_dd;
+}
+
+static void ggml_cuda_op_flatten(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 const ggml_cuda_op_flatten_t op) try {
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const bool use_src1 = src1 != nullptr;
+    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
+
+    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(              dst->backend != GGML_BACKEND_GPU_SPLIT);
+
+    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
+
+    const bool src0_on_device =             src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
+    const bool  dst_on_device =              dst->backend == GGML_BACKEND_GPU;
+
+    const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
+
+    // dd = data device
+    float * src0_ddf = nullptr;
+    float * src1_ddf = nullptr;
+    float *  dst_ddf = nullptr;
+
+    // as = actual size
+    size_t src0_asf = 0;
+    size_t src1_asf = 0;
+    size_t  dst_asf = 0;
+
+    ggml_cuda_set_device(g_main_device);
+    const dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    if (src0_on_device) {
+        src0_ddf = (float *) src0_extra->data_device[g_main_device];
+    } else {
+        src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
+        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
+    }
+
+    if (use_src1 && !src1_stays_on_host) {
+        if (src1_on_device) {
+            src1_ddf = (float *) src1_extra->data_device[g_main_device];
+        } else {
+            src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
+            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
+        }
+    }
+    if (dst_on_device) {
+        dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    } else {
+        dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
+    }
+
+    // do the computation
+    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    /*
+    DPCT1010:116: SYCL uses exceptions to report errors and does not use the
+    error codes. The call was replaced with 0. You need to rewrite this code.
+    */
+    CUDA_CHECK(0);
+
+    // copy dst to host if necessary
+    if (!dst_on_device) {
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
+    }
+
+    if (src0_asf > 0) {
+        ggml_cuda_pool_free(src0_ddf, src0_asf);
+    }
+    if (src1_asf > 0) {
+        ggml_cuda_pool_free(src1_ddf, src1_asf);
+    }
+    if (dst_asf > 0) {
+        ggml_cuda_pool_free(dst_ddf, dst_asf);
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            dpct::get_current_device().queues_wait_and_throw()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_set_peer_access(const int n_tokens) {
+    static bool peer_access_enabled = false;
+
+    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+
+    if (peer_access_enabled == enable_peer_access) {
+        return;
+    }
+
+#ifdef NDEBUG
+    for (int id = 0; id < g_device_count; ++id) {
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        for (int id_other = 0; id_other < g_device_count; ++id_other) {
+            if (id == id_other) {
+                continue;
+            }
+            if (id != g_main_device && id_other != g_main_device) {
+                continue;
+            }
+
+            int can_access_peer=1;
+            // CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            if (can_access_peer) {
+                if (enable_peer_access) {
+                    // CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+                } else {
+                    // CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+                }
+            }
+        }
+    }
+#endif // NDEBUG
+
+    peer_access_enabled = enable_peer_access;
+}
+
+static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
+                                 const ggml_tensor *src1, ggml_tensor *dst,
+                                 ggml_cuda_op_mul_mat_t op,
+                                 const bool convert_src1_to_q8_1) try {
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+    const int64_t nrows0 = ggml_nrows(src0);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+    const int64_t nrows1 = ggml_nrows(src1);
+
+    GGML_ASSERT(ne03 == ne13);
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    const int nb2 = dst->nb[2];
+    const int nb3 = dst->nb[3];
+
+    ggml_cuda_set_peer_access(ne11);
+
+    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
+
+    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
+
+    const int64_t i02_divisor = ne12 / ne02;
+
+    const size_t src0_ts = ggml_type_size(src0->type);
+    const size_t src0_bs = ggml_blck_size(src0->type);
+    const size_t q8_1_ts = sizeof(block_q8_1);
+    const size_t q8_1_bs = QK8_1;
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
+
+    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
+    const bool src0_is_contiguous = ggml_is_contiguous(src0);
+    const bool src1_is_contiguous = ggml_is_contiguous(src1);
+
+    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
+
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+    GGML_ASSERT(!(split && ne02 > 1));
+    GGML_ASSERT(!(split && ne03 > 1));
+    GGML_ASSERT(!(split && ne02 < ne12));
+
+    // dd = data device
+    char  *  src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
+    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
+    char  * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
+    float *   dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
+
+    // as = actual size
+    size_t  src0_as[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t   dst_as[GGML_CUDA_MAX_DEVICES] = {0};
+
+    int64_t  row_low[GGML_CUDA_MAX_DEVICES];
+    int64_t row_high[GGML_CUDA_MAX_DEVICES];
+
+    int used_devices = 0;
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        // by default, use all rows
+        row_low[id]  = 0;
+        row_high[id] = ne01;
+
+        // for multi GPU, get the row boundaries from tensor split
+        // and round to mul_mat_q tile sizes
+        if (split) {
+            const int64_t rounding = get_row_rounding(src0->type);
+
+            if (id != 0) {
+                row_low[id]  = ne01*g_tensor_split[id];
+                row_low[id] -= row_low[id] % rounding;
+            }
+
+            if (id != g_device_count - 1) {
+                row_high[id]  = ne01*g_tensor_split[id + 1];
+                row_high[id] -= row_high[id] % rounding;
+            }
+        }
+    }
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
+
+        used_devices++;
+
+        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+        const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+
+        ggml_cuda_set_device(id);
+        const dpct::queue_ptr stream = g_cudaStreams[id][0];
+
+        if (src0_on_device && src0_is_contiguous) {
+            src0_dd[id] = (char *) src0_extra->data_device[id];
+        } else {
+            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
+            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
+        }
+
+        if (src1_on_device && src1_is_contiguous) {
+            src1_ddf[id] = (float *) src1_extra->data_device[id];
+        } else {
+            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
+        }
+
+        if (convert_src1_to_q8_1) {
+            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
+
+            if (src1_on_device && src1_is_contiguous) {
+                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
+                /*
+                DPCT1010:117: SYCL uses exceptions to report errors and does not
+                use the error codes. The call was replaced with 0. You need to
+                rewrite this code.
+                */
+                CUDA_CHECK(0);
+            }
+        }
+
+        if (dst_on_device) {
+            dst_dd[id] = (float *) dst_extra->data_device[id];
+        } else {
+            const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
+            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
+        }
+    }
+
+    // if multiple devices are used they need to wait for the main device
+    // here an event is recorded that signals that the main device has finished calculating the input data
+    if (split && used_devices > 1) {
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        /*
+        DPCT1012:118: Detected kernel execution time measurement pattern and
+        generated an initial code for time measurements in SYCL. You can change
+        the way time is measured depending on your goals.
+        */
+        /*
+        DPCT1024:119: The original code returned the error code that was further
+        consumed by the program logic. This original code was replaced with 0.
+        You may need to rewrite the program logic consuming the error code.
+        */
+        src0_extra->events_ct1_0[g_main_device] =
+            std::chrono::steady_clock::now();
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            *src0_extra->events[g_main_device][0] =
+                g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier()));
+    }
+
+    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
+    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
+        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
+        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
+
+        for (int64_t id = 0; id < g_device_count; ++id) {
+            if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+                continue;
+            }
+
+            const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
+            const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+            const int64_t row_diff = row_high[id] - row_low[id];
+
+            ggml_cuda_set_device(id);
+            const dpct::queue_ptr stream = g_cudaStreams[id][is];
+
+            // wait for main GPU data if necessary
+            if (split && (id != g_main_device || is != 0)) {
+                // CUDA_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier(
+                //     {*src0_extra->events[g_main_device][0]})));
+
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    *src0_extra->events[g_main_device][0] = stream->ext_oneapi_submit_barrier()));
+
+            }
+
+            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
+                const int64_t i03 = i0 / ne12;
+                const int64_t i02 = i0 % ne12;
+
+                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
+
+                // for split tensors the data begins at i0 == i0_offset_low
+                char  *  src0_dd_i =  src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
+                float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
+                char  * src1_ddq_i = src1_ddq[id] +  src1_ddq_i_offset;
+                float *   dst_dd_i =   dst_dd[id] + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
+
+                // the main device memory buffer can be on VRAM scratch, with space for all partial results
+                // in that case an offset on dst_ddf_i is needed
+                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+                    dst_dd_i += row_low[id]; // offset is 0 if no tensor split
+                }
+
+                // copy src0, src1 to device if necessary
+                if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
+                    if (id != g_main_device) {
+                        if (convert_src1_to_q8_1) {
+                            char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
+                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                                src1_ddq_i, src1_ddq_i_source,
+                                src1_ncols * src1_padded_col_size * q8_1_ts /
+                                    q8_1_bs)));
+                        } else {
+                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
+                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
+                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                                src1_ddf_i, src1_ddf_i_source,
+                                src1_ncols * ne10 * sizeof(float))));
+                        }
+                    }
+                } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
+                                   src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
+                } else {
+                    GGML_ASSERT(false);
+                }
+
+                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
+                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+                    /*
+                    DPCT1010:120: SYCL uses exceptions to report errors and does
+                    not use the error codes. The call was replaced with 0. You
+                    need to rewrite this code.
+                    */
+                    CUDA_CHECK(0);
+                }
+
+                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
+                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
+                }
+
+                // do the computation
+                op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
+                   row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
+                /*
+                DPCT1010:121: SYCL uses exceptions to report errors and does not
+                use the error codes. The call was replaced with 0. You need to
+                rewrite this code.
+                */
+                CUDA_CHECK(0);
+
+                // copy dst to host or other device if necessary
+                if (!dst_on_device) {
+                    void * dst_off_device;
+                    dpct::memcpy_direction kind;
+                    if (dst->backend == GGML_BACKEND_CPU) {
+                        dst_off_device = dst->data;
+                        kind = dpct::device_to_host;
+                    } else if (dst->backend == GGML_BACKEND_GPU) {
+                        dst_off_device = dst_extra->data_device[g_main_device];
+                        kind = dpct::device_to_device;
+                    } else {
+                        GGML_ASSERT(false);
+                    }
+                    if (split) {
+                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
+                        // dst is NOT transposed.
+                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
+                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
+                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                        dhf_dst_i += src1_col_0*ne0 + row_low[id];
+                        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
+                            dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
+                            row_diff * sizeof(float), row_diff * sizeof(float),
+                            src1_ncols, kind, *stream)));
+                    } else {
+                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
+                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
+                        dhf_dst_i += src1_col_0*ne0;
+                        CUDA_CHECK(DPCT_CHECK_ERROR(
+                            stream->memcpy(dhf_dst_i, dst_dd_i,
+                                           src1_ncols * ne0 * sizeof(float))));
+                    }
+                }
+
+                // add event for the main device to wait on until other device is done
+                if (split && (id != g_main_device || is != 0)) {
+                    /*
+                    DPCT1012:122: Detected kernel execution time measurement
+                    pattern and generated an initial code for time measurements
+                    in SYCL. You can change the way time is measured depending
+                    on your goals.
+                    */
+                    /*
+                    DPCT1024:123: The original code returned the error code that
+                    was further consumed by the program logic. This original
+                    code was replaced with 0. You may need to rewrite the
+                    program logic consuming the error code.
+                    */
+                    src0_extra->events_ct1_is[id] =
+                        std::chrono::steady_clock::now();
+                    CUDA_CHECK(DPCT_CHECK_ERROR(
+                        *src0_extra->events[id][is] =
+                            stream->ext_oneapi_submit_barrier()));
+                }
+            }
+        }
+    }
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            continue;
+        }
+        CUDA_CHECK(ggml_cuda_set_device(id));
+
+        // free buffers again when done
+        if (src0_as[id] > 0) {
+            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
+        }
+        if (src1_asf[id] > 0) {
+            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+        }
+        if (src1_asq[id] > 0) {
+            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
+        }
+        if (dst_as[id] > 0) {
+            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
+        }
+    }
+
+    // main device waits for all other devices to be finished
+    if (split && g_device_count > 1) {
+        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
+        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
+
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        for (int64_t id = 0; id < g_device_count; ++id) {
+            if (row_low[id] == row_high[id]) {
+                continue;
+            }
+            for (int64_t is = 0; is < is_max; ++is) {
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier(
+                        {*src0_extra->events[id][is]})));
+            }
+        }
+    }
+
+    if (dst->backend == GGML_BACKEND_CPU) {
+        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            dpct::get_current_device().queues_wait_and_throw()));
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
+}
+
+static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
+}
+
+static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
+}
+
+static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
+}
+
+static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
+}
+
+static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
+}
+
+static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
+}
+
+static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
+}
+
+static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
+}
+
+static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
+}
+
+static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+}
+
+static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
+}
+
+static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+}
+
+static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
+}
+
+static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
+}
+
+static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
+}
+
+static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
+}
+
+static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
+}
+
+static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
+}
+
+bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_cublas_loaded) return false;
+
+    const int64_t ne10 = src1->ne[0];
+
+    const int64_t ne0 = dst->ne[0];
+    const int64_t ne1 = dst->ne[1];
+
+    // TODO: find the optimal values for these
+    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+            src1->type == GGML_TYPE_F32 &&
+             dst->type == GGML_TYPE_F32 &&
+            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
+}
+
+static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0,
+                                       const ggml_tensor *src1,
+                                       ggml_tensor *dst) try {
+    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
+    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t ne12 = src1->ne[2];
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0,
+                                     const ggml_tensor *src1,
+                                     ggml_tensor *dst) try {
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+    GGML_ASSERT(!ggml_is_permuted(src0));
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    const int64_t ne12 = src1->ne[2];
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    const int64_t row_stride_x = nb01 / sizeof(sycl::half);
+    const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
+
+    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
+                                   const sycl::half *src1_as_f16,
+                                   sycl::half *dst_f16, const void **ptrs_src,
+                                   void **ptrs_dst, int ne12, int ne13,
+                                   int ne23, int nb02, int nb03, int nb12,
+                                   int nb13, int nb2, int nb3, int r2, int r3,
+                                   const sycl::nd_item<3> &item_ct1) {
+    int i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
+              item_ct1.get_local_id(2);
+    int i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
+              item_ct1.get_local_id(1);
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)     dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
+                                                 const ggml_tensor *src1,
+                                                 ggml_tensor *dst) try {
+    GGML_ASSERT(!ggml_is_transposed(src0));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+
+    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src0->type == GGML_TYPE_F16);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne  = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    CUBLAS_CHECK(
+        DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream));
+
+    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    void * src0_ddq = src0_extra->data_device[g_main_device];
+    sycl::half *src0_as_f16 = (sycl::half *)src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    sycl::half *src1_as_f16 =
+        (sycl::half *)ggml_cuda_pool_malloc(ne1 * sizeof(sycl::half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    sycl::half *dst_f16 =
+        (sycl::half *)ggml_cuda_pool_malloc(ne * sizeof(sycl::half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const sycl::half alpha_f16 = 1.0f;
+    const sycl::half beta_f16 = 0.0f;
+
+#if 0
+    // use cublasGemmEx
+    {
+        for (int i13 = 0; i13 < ne13; ++i13) {
+            for (int i12 = 0; i12 < ne12; ++i12) {
+                int i03 = i13 / r3;
+                int i02 = i12 / r2;
+
+                CUBLAS_CHECK(
+                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F, nb01/sizeof(half),
+                                        (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
+                            &beta_f16,  (      char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
+                            CUBLAS_COMPUTE_16F,
+                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+            }
+        }
+    }
+#else
+    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
+        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
+        // use cublasGemmStridedBatchedEx
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
+            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, &alpha_f16,
+            (const char *)src0_as_f16, dpct::library_data_t::real_half,
+            nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
+            (const char *)src1_as_f16, dpct::library_data_t::real_half,
+            nb11 / sizeof(float), src1->nb[2] / sizeof(float), &beta_f16,
+            (char *)dst_f16, dpct::library_data_t::real_half, ne01,
+            dst->nb[2] / sizeof(float), ne12 * ne13,
+            dpct::library_data_t::real_half)));
+    } else {
+        // use cublasGemmBatchedEx
+        const int ne23 = ne12*ne13;
+
+        const void ** ptrs_src = nullptr;
+              void ** ptrs_dst = nullptr;
+
+        size_t ptrs_src_s = 0;
+        size_t ptrs_dst_s = 0;
+
+        ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+        ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+
+        sycl::range<3> block_dims(1, ne12, ne13);
+        /*
+        DPCT1049:62: The work-group size passed to the SYCL kernel may exceed
+        the limit. To get the device limit, query
+        info::device::max_work_group_size. Adjust the work-group size if needed.
+        */
+        {
+            dpct::has_capability_or_fail(main_stream->get_device(),
+                                         {sycl::aspect::fp16});
+            main_stream->submit([&](sycl::handler &cgh) {
+                int dst_nb_ct12 = dst->nb[2];
+                int dst_nb_ct13 = dst->nb[3];
+
+                cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
+                                 [=](sycl::nd_item<3> item_ct1) {
+                                     k_compute_batched_ptrs(
+                                         src0_as_f16, src1_as_f16, dst_f16,
+                                         ptrs_src, ptrs_dst, ne12, ne13, ne23,
+                                         nb02, nb03, nb12, nb13, dst_nb_ct12,
+                                         dst_nb_ct13, r2, r3, item_ct1);
+                                 });
+            });
+        }
+        /*
+        DPCT1010:124: SYCL uses exceptions to report errors and does not use the
+        error codes. The call was replaced with 0. You need to rewrite this
+        code.
+        */
+        CUDA_CHECK(0);
+
+        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
+            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
+            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, &alpha_f16,
+            (const void **)(ptrs_src + 0 * ne23),
+            dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),
+            (const void **)(ptrs_src + 1 * ne23),
+            dpct::library_data_t::real_half, nb11 / sizeof(float), &beta_f16,
+            (void **)(ptrs_dst + 0 * ne23), dpct::library_data_t::real_half,
+            ne01, ne23, dpct::library_data_t::real_half)));
+
+        if (ptrs_src_s != 0) {
+            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+        }
+        if (ptrs_dst_s != 0) {
+            ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+        }
+    }
+#endif
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const bool all_on_device =
+        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
+        (src1->backend == GGML_BACKEND_GPU) &&
+        ( dst->backend == GGML_BACKEND_GPU);
+
+    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
+
+    int64_t min_compute_capability = INT_MAX;
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+            min_compute_capability = g_compute_capabilities[id];
+        }
+    }
+
+#ifdef CUDA_USE_TENSOR_CORES
+    const bool use_tensor_cores = true;
+#else
+    const bool use_tensor_cores = false;
+#endif
+
+    // debug helpers
+    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
+    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
+    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
+    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
+    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
+    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
+
+    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+        // KQ single-batch
+        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
+    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        // KQV single-batch
+        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
+    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+        // KQ + KQV multi-batch
+        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
+    } else if (src0->type == GGML_TYPE_F32) {
+        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
+#ifdef GGML_CUDA_FORCE_DMMV
+            const bool use_mul_mat_vec_q = false;
+#else
+            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
+#endif // GGML_CUDA_FORCE_DMMV
+
+            if (use_mul_mat_vec_q) {
+                // NOTE: this kernel does not support ggml_nrows(src1) > 1
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+            } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+            }
+        } else {
+            bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+
+            // when tensor cores are available, use them for large batch size
+            // ref: https://github.com/ggerganov/llama.cpp/pull/3776
+            if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+                use_mul_mat_q = false;
+            }
+
+            if (use_mul_mat_q) {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
+            } else {
+                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+            }
+        }
+    } else {
+        GGML_ASSERT(false);
+    }
+}
+
+#if 0
+template<typename ... Srcs>
+static __global__ void k_compute_batched_ptrs_id(
+        const void ** ptrs_src, void ** ptrs_dst,
+        int ne12, int ne13,
+        int ne23,
+        int nb02, int nb03,
+        int nb12, int nb13,
+        int nb2, int nb3,
+        int r2, int r3,
+        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
+        const half * src1_f16, half * dst_f16,
+        const int32_t * ids, const int id,
+        Srcs... src0s) {
+
+    int i = ids[id];
+
+    half * src0_f16;
+    const void * srcs_ar[] = { (const half *) src0s... };
+    if (src0_type == GGML_TYPE_F16) {
+        src0_f16 = (half *) srcs_ar[i];
+    } else {
+        src0_f16 = src0_as_f16;
+        if (threadIdx.x == 0 && threadIdx.y == 0) {
+            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
+            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
+        }
+    }
+
+    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
+    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
+
+    if (i13 >= ne13 || i12 >= ne12) {
+        return;
+    }
+
+    int i03 = i13 / r3;
+    int i02 = i12 / r2;
+
+    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
+    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
+    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
+}
+
+static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
+    const struct ggml_tensor * ids = dst->src[0];
+    const struct ggml_tensor * src1 = dst->src[1];
+    const struct ggml_tensor * src00 = dst->src[2];
+
+    const int id = dst->op_params[0];
+
+    GGML_ASSERT(!ggml_is_transposed(src00));
+    GGML_ASSERT(!ggml_is_transposed(src1));
+
+    GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
+    GGML_ASSERT(src1->type == GGML_TYPE_F32);
+
+    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
+    const int64_t ne01 = src00->ne[1];
+    const int64_t ne02 = src00->ne[2];
+    const int64_t ne03 = src00->ne[3];
+
+    //const int64_t nb01 = src00->nb[1];
+    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
+    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    const int64_t ne12 = src1->ne[2];
+    const int64_t ne13 = src1->ne[3];
+
+    //const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
+    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+
+    const int64_t ne1 = ggml_nelements(src1);
+    const int64_t ne  = ggml_nelements(dst);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+
+    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
+
+    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    //void * src0_ddq = src0_extra->data_device[g_main_device];
+    //half * src0_as_f16 = (half *) src0_ddq;
+
+    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+
+    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+
+    // convert src1 to fp16
+    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
+    GGML_ASSERT(to_fp16_cuda != nullptr);
+
+    size_t src1_as = 0;
+    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+
+    size_t dst_as = 0;
+    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+
+    GGML_ASSERT(ne12 % ne02 == 0);
+    GGML_ASSERT(ne13 % ne03 == 0);
+
+    // broadcast factors
+    const int64_t r2 = ne12/ne02;
+    const int64_t r3 = ne13/ne03;
+
+    const half alpha_f16 = 1.0f;
+    const half beta_f16  = 0.0f;
+
+    // use cublasGemmBatchedEx
+    const int ne23 = ne12*ne13;
+
+    const void ** ptrs_src = nullptr;
+          void ** ptrs_dst = nullptr;
+
+    size_t ptrs_src_s = 0;
+    size_t ptrs_dst_s = 0;
+
+    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+
+    int64_t src0_ne = ggml_nelements(src00);
+    half * src0_as_f16 = nullptr;
+    size_t src0_as = 0;
+    if (src00->type != GGML_TYPE_F16) {
+        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
+    }
+
+    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
+    dim3 block_dims(ne13, ne12);
+    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
+            ptrs_src, ptrs_dst,
+            ne12, ne13,
+            ne23,
+            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
+            nb12, nb13,
+            dst->nb[2], dst->nb[3],
+            r2, r3,
+            src00->type, src0_as_f16, src0_ne,
+            src1_as_f16, dst_f16,
+            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
+            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
+            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
+    );
+    CUDA_CHECK(cudaGetLastError());
+
+    CUBLAS_CHECK(
+    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+            ne01, ne11, ne10,
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
+                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
+            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+            ne23,
+            CUBLAS_COMPUTE_16F,
+            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+
+    if (src0_as != 0) {
+        ggml_cuda_pool_free(src0_as_f16, src0_as);
+    }
+    if (ptrs_src_s != 0) {
+        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+    }
+    if (ptrs_dst_s != 0) {
+        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+    }
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
+    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+
+    ggml_cuda_pool_free(src1_as_f16, src1_as);
+    ggml_cuda_pool_free(dst_f16, dst_as);
+}
+#endif
+
+static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
+                                 const ggml_tensor *src1,
+                                 ggml_tensor *dst) try {
+#if 0
+    ggml_cuda_mul_mat_id_cublas(dst);
+    // TODO: mmq/mmv support
+#endif
+
+    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
+
+    const struct ggml_tensor * ids = src0;
+    const int32_t id = ((int32_t *) dst->op_params)[0];
+    const int32_t n_as = ((int32_t *) dst->op_params)[1];
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+
+    if (ids->backend == GGML_BACKEND_GPU) {
+        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
+        CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[g_main_device][0]->memcpy(
+            ids_host.data(), ids_dev, ggml_nbytes(ids))));
+        CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[g_main_device][0]->wait()));
+    } else {
+        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
+    }
+
+    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
+    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
+
+    ggml_tensor_extra_gpu src1_row_extra;
+    ggml_tensor_extra_gpu dst_row_extra;
+
+    ggml_tensor src1_row = *src1;
+    ggml_tensor dst_row = *dst;
+
+    src1_row.ne[1] = 1;
+    dst_row.ne[1] = 1;
+
+    src1_row.nb[2] = src1_row.nb[1];
+    dst_row.nb[2] = dst_row.nb[1];
+
+    src1_row.nb[3] = src1_row.nb[1];
+    dst_row.nb[3] = dst_row.nb[1];
+
+    src1_row.extra = &src1_row_extra;
+    dst_row.extra = &dst_row_extra;
+
+
+    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
+        //int32_t row_id;
+        //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+        //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+        const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
+
+        GGML_ASSERT(row_id >= 0 && row_id < n_as);
+
+        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
+
+        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
+        src1_row.data = (char *) src1->data + i01*src1->nb[1];
+
+        dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
+        dst_row.data = (char *) dst->data + i01*dst->nb[1];
+
+        ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
+}
+
+static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
+}
+
+static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
+                          ggml_tensor *dst) try {
+    const int64_t ne = ggml_nelements(src0);
+    GGML_ASSERT(ne == ggml_nelements(src1));
+
+    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
+
+    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
+    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
+
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    GGML_ASSERT(src0->ne[3] == 1);
+
+    const int64_t nb00 = src0->nb[0];
+    const int64_t nb01 = src0->nb[1];
+    const int64_t nb02 = src0->nb[2];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+    GGML_ASSERT(src1->ne[3] == 1);
+
+    const int64_t nb10 = src1->nb[0];
+    const int64_t nb11 = src1->nb[1];
+    const int64_t nb12 = src1->nb[2];
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+
+    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+
+    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
+
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else {
+        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ASSERT(false);
+    }
+
+    (void) dst;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    // TODO: why do we pass dst as src1 here?
+    ggml_cuda_cpy(src0, dst, nullptr);
+    (void) src1;
+}
+
+static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
+}
+
+static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
+}
+
+static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
+}
+
+static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
+}
+
+static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+}
+
+static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
+}
+
+static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_ASSERT(ggml_is_contiguous(src0));
+    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
+}
+
+static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    (void) src0;
+    (void) src1;
+    (void) dst;
+}
+
+static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
+    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
+
+    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
+}
+
+void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
+    const int64_t nrows = ggml_nrows(tensor);
+
+    const int64_t ne0 = tensor->ne[0];
+
+    const size_t nb1 = tensor->nb[1];
+
+    ggml_backend_type backend = tensor->backend;
+    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+    memset(extra, 0, sizeof(*extra));
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (backend == GGML_BACKEND_GPU && id != g_main_device) {
+            continue;
+        }
+
+        ggml_cuda_set_device(id);
+
+        int64_t row_low, row_high;
+        if (backend == GGML_BACKEND_GPU) {
+            row_low = 0;
+            row_high = nrows;
+        } else if (backend == GGML_BACKEND_GPU_SPLIT) {
+            const int64_t rounding = get_row_rounding(tensor->type);
+
+            row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
+            row_low -= row_low % rounding;
+
+            if (id == g_device_count - 1) {
+                row_high = nrows;
+            } else {
+                row_high = nrows*g_tensor_split[id + 1];
+                row_high -= row_high % rounding;
+            }
+        } else {
+            GGML_ASSERT(false);
+        }
+        if (row_low == row_high) {
+            continue;
+        }
+
+        int64_t nrows_split = row_high - row_low;
+
+        const size_t offset_split = row_low*nb1;
+        size_t size = ggml_nbytes_split(tensor, nrows_split);
+        const size_t original_size = size;
+
+        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+
+        char * buf;
+        CUDA_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(
+                                        size, dpct::get_in_order_queue())));
+        char * buf_host = (char*)data + offset_split;
+
+        // set padding to 0 to avoid possible NaN values
+        if (size > original_size) {
+            CUDA_CHECK(DPCT_CHECK_ERROR(
+                dpct::get_in_order_queue()
+                    .memset(buf + original_size, 0, size - original_size)
+                    .wait()));
+        }
+
+        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                                        .memcpy(buf, buf_host, original_size)
+                                        .wait()));
+
+        extra->data_device[id] = buf;
+
+        if (backend == GGML_BACKEND_GPU_SPLIT) {
+            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+                CUDA_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] =
+                                                new sycl::event()));
+            }
+        }
+    }
+
+    tensor->extra = extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_free_data(struct ggml_tensor *tensor) try {
+    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+
+    for (int64_t id = 0; id < g_device_count; ++id) {
+        if (extra->data_device[id] != nullptr) {
+            CUDA_CHECK(ggml_cuda_set_device(id));
+            CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(
+                extra->data_device[id], dpct::get_in_order_queue())));
+        }
+
+        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+            if (extra->events[id][is] != nullptr) {
+                CUDA_CHECK(ggml_cuda_set_device(id));
+                CUDA_CHECK(DPCT_CHECK_ERROR(
+                    dpct::destroy_event(extra->events[id][is])));
+            }
+        }
+    }
+
+    delete extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static size_t g_temp_tensor_extra_index = 0;
+
+static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    if (g_temp_tensor_extras == nullptr) {
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
+    }
+
+    size_t alloc_index = g_temp_tensor_extra_index;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
+    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+    memset(extra, 0, sizeof(*extra));
+
+    return extra;
+}
+
+static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
+                                          bool scratch, bool force_inplace,
+                                          bool no_alloc) try {
+    if (scratch && g_scratch_size == 0) {
+        return;
+    }
+
+    tensor->backend = GGML_BACKEND_GPU;
+
+    // recursively assign CUDA buffers until a compute tensor is found
+    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
+        const ggml_op src0_op = tensor->src[0]->op;
+        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
+            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
+        }
+    }
+    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
+        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
+    }
+
+    if (scratch && no_alloc) {
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra;
+
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW ||
+        force_inplace;
+    const size_t size = ggml_nbytes(tensor);
+
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&offset, tensor->op_params, sizeof(size_t));
+        }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra->data_device[g_main_device] = src0_ddc + offset;
+    } else if (tensor->op == GGML_OP_CPY) {
+        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
+        void * src1_ddv = src1_extra->data_device[g_main_device];
+        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra->data_device[g_main_device] = src1_ddv;
+    } else if (scratch) {
+        GGML_ASSERT(size <= g_scratch_size);
+        if (g_scratch_offset + size > g_scratch_size) {
+            g_scratch_offset = 0;
+        }
+
+        char * data = (char *) g_scratch_buffer;
+        if (data == nullptr) {
+            CUDA_CHECK(DPCT_CHECK_ERROR(
+                data = (char *)sycl::malloc_device(
+                    g_scratch_size, dpct::get_in_order_queue())));
+            g_scratch_buffer = data;
+        }
+        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra->data_device[g_main_device] = data + g_scratch_offset;
+
+        g_scratch_offset += size;
+
+        GGML_ASSERT(g_scratch_offset <= g_scratch_size);
+    } else { // allocate new buffers outside of scratch
+        void * data;
+        CUDA_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(
+                                        size, dpct::get_in_order_queue())));
+        CUDA_CHECK(DPCT_CHECK_ERROR(
+            dpct::get_in_order_queue().memset(data, 0, size).wait()));
+        extra = new ggml_tensor_extra_gpu;
+        memset(extra, 0, sizeof(*extra));
+        extra->data_device[g_main_device] = data;
+    }
+
+    tensor->extra = extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor,
+                                     size_t offset) try {
+    if (g_scratch_size == 0) {
+        return;
+    }
+    if (g_scratch_buffer == nullptr) {
+        ggml_cuda_set_device(g_main_device);
+        CUDA_CHECK(
+            DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
+                                 g_scratch_size, dpct::get_in_order_queue())));
+    }
+
+    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+
+    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
+        tensor->op == GGML_OP_VIEW;
+
+    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
+        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        size_t view_offset = 0;
+        if (tensor->op == GGML_OP_VIEW) {
+            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
+        }
+        extra->data_device[g_main_device] = src0_ddc + view_offset;
+    } else {
+        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+    }
+
+    tensor->extra = extra;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_copy_to_device(struct ggml_tensor *tensor) try {
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+    GGML_ASSERT(ggml_is_contiguous(tensor));
+
+    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                                    .memcpy(extra->data_device[g_main_device],
+                                            tensor->data, ggml_nbytes(tensor))
+                                    .wait()));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+}
+
+void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
+}
+
+void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
+}
+
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
+}
+
+void ggml_cuda_set_main_device(const int main_device) try {
+    if (main_device >= g_device_count) {
+        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
+                main_device, g_device_count, g_main_device);
+        return;
+    }
+
+    if (g_main_device != main_device && g_device_count > 1) {
+        g_main_device = main_device;
+        dpct::device_info prop;
+        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(g_main_device))));
+        fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__,
+                g_main_device, prop.get_name());
+    }
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
+    // it still won't always work as expected, but it's better than nothing
+    if (scratch_size > g_scratch_size) {
+        ggml_cuda_free_scratch();
+    }
+    g_scratch_size = std::max(g_scratch_size, scratch_size);
+}
+
+void ggml_cuda_free_scratch() try {
+    if (g_scratch_buffer == nullptr) {
+        return;
+    }
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(
+        sycl::free(g_scratch_buffer, dpct::get_in_order_queue())));
+    g_scratch_buffer = nullptr;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_cublas_loaded) return false;
+
+    ggml_cuda_func_t func;
+    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
+        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
+        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
+
+    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+        return false;
+    }
+
+    if (tensor->op == GGML_OP_MUL_MAT) {
+        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
+#ifndef NDEBUG
+            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
+#endif
+            return false;
+        }
+    }
+
+    switch (tensor->op) {
+        case GGML_OP_REPEAT:
+            func = ggml_cuda_repeat;
+            break;
+        case GGML_OP_GET_ROWS:
+            func = ggml_cuda_get_rows;
+            break;
+        case GGML_OP_DUP:
+            func = ggml_cuda_dup;
+            break;
+        case GGML_OP_ADD:
+            func = ggml_cuda_add;
+            break;
+        case GGML_OP_ACC:
+            func = ggml_cuda_acc;
+            break;
+        case GGML_OP_MUL:
+            func = ggml_cuda_mul;
+            break;
+        case GGML_OP_DIV:
+            func = ggml_cuda_div;
+            break;
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(tensor)) {
+                case GGML_UNARY_OP_GELU:
+                    func = ggml_cuda_gelu;
+                    break;
+                case GGML_UNARY_OP_SILU:
+                    func = ggml_cuda_silu;
+                    break;
+                case GGML_UNARY_OP_GELU_QUICK:
+                    func = ggml_cuda_gelu_quick;
+                    break;
+                case GGML_UNARY_OP_TANH:
+                    func = ggml_cuda_tanh;
+                    break;
+                case GGML_UNARY_OP_RELU:
+                    func = ggml_cuda_relu;
+                    break;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_NORM:
+            func = ggml_cuda_norm;
+            break;
+        case GGML_OP_GROUP_NORM:
+            func = ggml_cuda_group_norm;
+            break;
+        case GGML_OP_CONCAT:
+            func = ggml_cuda_concat;
+            break;
+        case GGML_OP_UPSCALE:
+            func = ggml_cuda_upscale;
+            break;
+        case GGML_OP_PAD:
+            func = ggml_cuda_pad;
+            break;
+        case GGML_OP_LEAKY_RELU:
+            func = ggml_cuda_leaky_relu;
+            break;
+        case GGML_OP_RMS_NORM:
+            func = ggml_cuda_rms_norm;
+            break;
+        case GGML_OP_MUL_MAT:
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
+                return false;
+            }
+            func = ggml_cuda_mul_mat;
+            break;
+        case GGML_OP_MUL_MAT_ID:
+            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
+                return false;
+            }
+            func = ggml_cuda_mul_mat_id;
+            break;
+        case GGML_OP_SCALE:
+            func = ggml_cuda_scale;
+            break;
+        case GGML_OP_SQR:
+            func = ggml_cuda_sqr;
+            break;
+        case GGML_OP_CLAMP:
+            func = ggml_cuda_clamp;
+            break;
+        case GGML_OP_CPY:
+            func = ggml_cuda_cpy;
+            break;
+        case GGML_OP_CONT:
+            func = ggml_cuda_dup;
+            break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+            func = ggml_cuda_nop;
+            break;
+        case GGML_OP_DIAG_MASK_INF:
+            func = ggml_cuda_diag_mask_inf;
+            break;
+        case GGML_OP_SOFT_MAX:
+            func = ggml_cuda_soft_max;
+            break;
+        case GGML_OP_ROPE:
+            func = ggml_cuda_rope;
+            break;
+        case GGML_OP_ALIBI:
+            func = ggml_cuda_alibi;
+            break;
+        case GGML_OP_IM2COL:
+            func = ggml_cuda_im2col;
+            break;
+        case GGML_OP_SUM_ROWS:
+            func = ggml_cuda_sum_rows;
+            break;
+        case GGML_OP_ARGSORT:
+            func = ggml_cuda_argsort;
+            break;
+        default:
+            return false;
+    }
+
+    if (params->ith != 0) {
+        return true;
+    }
+    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
+        return true;
+    }
+    func(tensor->src[0], tensor->src[1], tensor);
+    return true;
+}
+
+int ggml_cuda_get_device_count() try {
+    int device_count;
+    if (DPCT_CHECK_ERROR(device_count =
+                             dpct::dev_mgr::instance().device_count()) != 0) {
+        return 0;
+    }
+    return device_count;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+void ggml_cuda_get_device_description(int device, char *description,
+                                      size_t description_size) try {
+    dpct::device_info prop;
+    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+        prop, dpct::dev_mgr::instance().get_device(device))));
+    snprintf(description, description_size, "%s", prop.get_name());
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#define UNUSED GGML_UNUSED
+
+// cuda buffer
+
+struct ggml_backend_buffer_context_cuda {
+    int device;
+    void * dev_ptr = nullptr;
+    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+    size_t temp_tensor_extra_index = 0;
+
+    ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+
+    ~ggml_backend_buffer_context_cuda() {
+        delete[] temp_tensor_extras;
+    }
+
+    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+        if (temp_tensor_extras == nullptr) {
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
+        }
+
+        size_t alloc_index = temp_tensor_extra_index;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
+        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+        memset(extra, 0, sizeof(*extra));
+
+        return extra;
+    }
+};
+
+static void
+ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
+    delete ctx;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    return ctx->dev_ptr;
+}
+
+static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer,
+                                                 ggml_tensor *tensor) try {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
+    if (tensor->view_src != NULL && tensor->view_offs == 0) {
+        assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
+        tensor->backend = tensor->view_src->backend;
+        tensor->extra = tensor->view_src->extra;
+        return;
+    }
+
+    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+    extra->data_device[ctx->device] = tensor->data;
+
+    tensor->backend = GGML_BACKEND_GPU;
+    tensor->extra = extra;
+
+    if (ggml_is_quantized(tensor->type)) {
+        // initialize padding to 0 to avoid possible NaN values
+        int64_t row_low = 0;
+        int64_t row_high = ggml_nrows(tensor);
+        int64_t nrows_split = row_high - row_low;
+
+        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
+
+        if (padded_size > original_size && tensor->view_src == nullptr) {
+            CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset(
+                (char *)tensor->data + original_size, 0,
+                padded_size - original_size)));
+        }
+    }
+
+    UNUSED(buffer);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer,
+                                                ggml_tensor *tensor,
+                                                const void *data, size_t offset,
+                                                size_t size) try {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+                             .memcpy((char *)tensor->data + offset, data, size)
+                             .wait()));
+
+    UNUSED(buffer);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer,
+                                                const ggml_tensor *tensor,
+                                                void *data, size_t offset,
+                                                size_t size) try {
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(
+        dpct::get_in_order_queue()
+            .memcpy(data, (const char *)tensor->data + offset, size)
+            .wait()));
+
+    UNUSED(buffer);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
+    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
+    /* .cpy_tensor_from = */ NULL,
+    /* .cpy_tensor_to   = */ NULL,
+};
+
+// cuda buffer type
+
+static ggml_backend_buffer_t
+ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                           size_t size) try {
+    int device = (int) (intptr_t) buft->context;
+
+    ggml_cuda_set_device(device);
+
+    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+
+    void * dev_ptr;
+    CUDA_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(
+                                    size, dpct::get_in_order_queue())));
+
+    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+
+    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+    return 128;
+
+    UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+    int64_t row_low = 0;
+    int64_t row_high = ggml_nrows(tensor);
+    int64_t nrows_split = row_high - row_low;
+
+    size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+    int64_t ne0 = tensor->ne[0];
+
+    if (ggml_is_quantized(tensor->type)) {
+        if (ne0 % MATRIX_ROW_PADDING != 0) {
+            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+        }
+    }
+
+    return size;
+
+    UNUSED(buft);
+}
+
+static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_cuda(backend);
+
+    UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
+    static bool ggml_backend_buffer_type_cuda_initialized = false;
+    if (!ggml_backend_buffer_type_cuda_initialized) {
+        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
+            ggml_backend_buffer_type_cuda[i] = {
+                /* .iface    = */ cuda_backend_buffer_type_interface,
+                /* .context  = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+            };
+        }
+        ggml_backend_buffer_type_cuda_initialized = true;
+    }
+
+    return &ggml_backend_buffer_type_cuda[device];
+}
+
+// host buffer type
+
+static void
+ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    CUDA_CHECK(
+        DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
+    delete ctx;
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static ggml_backend_buffer_t
+ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+                                                size_t size) try {
+    void * ptr;
+    CUDA_CHECK(DPCT_CHECK_ERROR(
+        ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue())));
+
+    // FIXME: this is a hack to avoid having to implement a new buffer type
+    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
+    buffer->buft = buft;
+    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
+
+    return buffer;
+
+    UNUSED(buft);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
+    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
+        /* .iface    = */ cuda_backend_host_buffer_type_interface,
+        /* .context  = */ nullptr,
+    };
+
+    return &ggml_backend_buffer_type_cuda_host;
+}
+
+// backend
+
+struct ggml_backend_context_cuda {
+    int device;
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+    return GGML_CUDA_NAME;
+
+    UNUSED(backend);
+}
+
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    delete cuda_ctx;
+    delete backend;
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
+                                               ggml_tensor *tensor,
+                                               const void *data, size_t offset,
+                                               size_t size) try {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+        (char *)tensor->data + offset, data, size)));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
+                                               const ggml_tensor *tensor,
+                                               void *data, size_t offset,
+                                               size_t size) try {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+        data, (const char *)tensor->data + offset, size)));
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
+
+    UNUSED(backend);
+}
+catch (sycl::exception const &exc) {
+  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+            << ", line:" << __LINE__ << std::endl;
+  std::exit(1);
+}
+
+static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    GGML_ASSERT(!"not implemented");
+
+    return nullptr;
+
+    UNUSED(backend);
+    UNUSED(cgraph);
+}
+
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+    GGML_ASSERT(!"not implemented");
+
+    UNUSED(backend);
+    UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+
+    ggml_cuda_set_main_device(cuda_ctx->device);
+
+    ggml_compute_params params = {};
+    params.type = GGML_TASK_COMPUTE;
+    params.ith = 0;
+    for (int i = 0; i < cgraph->n_nodes; i++) {
+        ggml_tensor * node = cgraph->nodes[i];
+
+        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+            continue;
+
+        assert(node->backend == GGML_BACKEND_GPU);
+        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        assert(node->extra != nullptr);
+
+        for (int j = 0; j < GGML_MAX_SRC; j++) {
+            if (node->src[j] != nullptr) {
+                assert(node->src[j]->backend == GGML_BACKEND_GPU);
+                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->extra != nullptr);
+            }
+        }
+
+        bool ok = ggml_cuda_compute_forward(&params, node);
+        if (!ok) {
+            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+        }
+        GGML_ASSERT(ok);
+
+#if 0
+        if (node->type == GGML_TYPE_F32) {
+            cudaDeviceSynchronize();
+            std::vector<float> tmp(ggml_nelements(node), 0.0f);
+            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+                ggml_type_name(node->src[0]->type),
+                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+                node->src[0]->name,
+                node->src[1] ? node->src[1]->name : "none");
+            double sum = 0.0;
+            double sq_sum = 0.0;
+            for (int i = 0; i < ggml_nelements(node); i++) {
+                printf("%f ", tmp[i]);
+                sum += tmp[i];
+                sq_sum += tmp[i]*tmp[i];
+            }
+            printf("\n");
+            printf("sum: %f, ", sum);
+            printf("sq_sum: %f\n", sq_sum);
+        }
+#endif
+    }
+
+    UNUSED(backend);
+}
+
+static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+    switch (op->op) {
+        case GGML_OP_UNARY:
+            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_GELU:
+                case GGML_UNARY_OP_SILU:
+                case GGML_UNARY_OP_RELU:
+                case GGML_UNARY_OP_GELU_QUICK:
+                case GGML_UNARY_OP_TANH:
+                    return true;
+                default:
+                    return false;
+            }
+            break;
+        case GGML_OP_MUL_MAT:
+        case GGML_OP_MUL_MAT_ID:
+            {
+                struct ggml_tensor * a;
+                struct ggml_tensor * b;
+                if (op->op == GGML_OP_MUL_MAT) {
+                    a = op->src[0];
+                    b = op->src[1];
+                } else {
+                    a = op->src[2];
+                    b = op->src[1];
+                }
+                if (a->ne[3] != b->ne[3]) {
+                    return false;
+                }
+                return true;
+            } break;
+        case GGML_OP_GET_ROWS:
+            {
+                switch (op->src[0]->type) {
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                        return true;
+                    default:
+                        return false;
+                }
+            } break;
+        case GGML_OP_CPY:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                ggml_type src1_type = op->src[1]->type;
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
+                    return true;
+                }
+                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
+                    return true;
+                }
+                return false;
+            } break;
+        case GGML_OP_NONE:
+        case GGML_OP_RESHAPE:
+        case GGML_OP_VIEW:
+        case GGML_OP_PERMUTE:
+        case GGML_OP_TRANSPOSE:
+        case GGML_OP_NORM:
+        case GGML_OP_REPEAT:
+        case GGML_OP_DUP:
+        case GGML_OP_ADD:
+        case GGML_OP_MUL:
+        case GGML_OP_DIV:
+        case GGML_OP_RMS_NORM:
+        case GGML_OP_SCALE:
+        case GGML_OP_SQR:
+        case GGML_OP_CLAMP:
+        case GGML_OP_CONT:
+        case GGML_OP_DIAG_MASK_INF:
+        case GGML_OP_SOFT_MAX:
+        case GGML_OP_ROPE:
+        case GGML_OP_ALIBI:
+        case GGML_OP_IM2COL:
+        case GGML_OP_SUM_ROWS:
+        case GGML_OP_ARGSORT:
+        case GGML_OP_ACC:
+        case GGML_OP_CONCAT:
+        case GGML_OP_GROUP_NORM:
+        case GGML_OP_UPSCALE:
+        case GGML_OP_PAD:
+        case GGML_OP_LEAKY_RELU:
+            return true;
+        default:
+            return false;
+    }
+
+    UNUSED(backend);
+}
+
+static ggml_backend_i cuda_backend_i = {
+    /* .get_name                = */ ggml_backend_cuda_name,
+    /* .free                    = */ ggml_backend_cuda_free,
+    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
+    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
+    /* .cpy_tensor_from_async   = */ NULL,
+    /* .cpy_tensor_to_async     = */ NULL,
+    /* .synchronize             = */ ggml_backend_cuda_synchronize,
+    /* .graph_plan_create       = */ ggml_backend_cuda_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_cuda_graph_plan_free,
+    /* .graph_plan_compute      = */ ggml_backend_cuda_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
+    /* .supports_op             = */ ggml_backend_cuda_supports_op,
+};
+
+ggml_backend_t ggml_backend_cuda_init(int device) {
+    ggml_init_cublas(); // TODO: remove from ggml.c
+
+    if (device < 0 || device >= ggml_cuda_get_device_count()) {
+        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
+        return nullptr;
+    }
+
+    // not strictly necessary, but it may reduce the overhead of the first graph_compute
+    ggml_cuda_set_main_device(device);
+
+    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
+        /* .device = */ device
+    };
+
+    ggml_backend_t cuda_backend = new ggml_backend {
+        /* .interface = */ cuda_backend_i,
+        /* .context   = */ ctx
+    };
+
+    return cuda_backend;
+}
+
+bool ggml_backend_is_cuda(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_cuda_name;
+}
+
+static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
+    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
+    return cuda_backend;
+
+    UNUSED(params);
+}
+
+extern "C" int ggml_backend_cuda_reg_devices();
+
+int ggml_backend_cuda_reg_devices() {
+    int device_count = ggml_cuda_get_device_count();
+    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
+    printf("device count=%d\n", device_count);
+    for (int i = 0; i < device_count; i++) {
+        char name[128];
+        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
+        printf("register name=%s\n", name);
+        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
+    }
+    return device_count;
+}
diff --git a/migrate.sh b/migrate.sh
new file mode 100755
index 0000000000000..93697c6b01579
--- /dev/null
+++ b/migrate.sh
@@ -0,0 +1,18 @@
+echo "modify the ggml-sycl.cpp to fix dpct result error"
+TARGET_FILE=ggml-sycl.cpp
+sed -i "s/CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());/id = dpct::dev_mgr::instance().current_device_id();/g" ${TARGET_FILE}
+sed -i "s/CUDA_CHECK(current_device = dpct::dev_mgr::instance().current_device_id());/current_device = dpct::dev_mgr::instance().current_device_id();/g" ${TARGET_FILE}
+sed -i "s/g_cublas_handles, oneapi::mkl::transpose::trans,/*g_cublas_handles[id], oneapi::mkl::transpose::trans,/g" ${TARGET_FILE}
+sed -i "s/cu_compute_type = CUBLAS_COMPUTE_16F;/cu_compute_type = dpct::library_data_t::real_half;/g" ${TARGET_FILE}
+sed -i "s/cu_compute_type = CUBLAS_COMPUTE_32F;/cu_compute_type = dpct::library_data_t::real_float;/g" ${TARGET_FILE}
+sed -i "s/tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs, stream_ct1);/tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);/g" ${TARGET_FILE}
+sed -i "s/cuGetErrorString(err, &err_str);/\/\/cuGetErrorString(err, &err_str);/g" ${TARGET_FILE}
+
+#set empty function
+#ggml_cuda_set_peer_access
+#ggml_cuda_pool_malloc_vmm
+#ggml_cuda_pool_free_vmm
+
+#replace tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs, stream_ct1);/tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+
+echo "done"
\ No newline at end of file
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000000000..6ac5b2a51edac
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,12 @@
+INPUT1="The process of Origami seems simple at the first glance, but in fact, it still requires a very complicated process to do it well. Taking folding a rose as an example, we can divide the entire process into three stages, including: firstly creating a grid of creases, secondly making a three-dimensional base, and thirdly finishing petal decoration. The first step is to create a grid of creases: this step is a bit like the first step of folding a gift of thousand-paper-crane. That is to say, we can fold the paper in half (or namedly equal-folds) through the symmetrical axis, and repeat such step in the other symmetrical axis. And then apply multiple equal-folds in sequence relative to each smaller rectangle divided by the two creases; After that, the creases in each direction will interweave into a complete set of uniform small square splicing patterns; these small squares form a reference space similar to a two-dimensional coordinate system, allowing us to combine adjacent creases on the plane from Three-dimensional high platforms or depressions are folded on the two-dimensional small squares to facilitate the next steps of folding. It should be noted that, in the process of creating grid creases, there may be rare cases when the folds are not aligned. The consequences of this error can be very serious. And just like the butterfly effect, it is only a slight difference at the beginning , and in the end it may generate a disaster world which is completely different from plan. Anyway, let's continue. The second step is make the three-dimensional base: In this step, we need to fold a set of symmetrical three-dimensional high platforms or depressions based on the grid creases. From the symmetry analysis, it is not difficult to find that the rose will have four symmetrical three-dimensional high platforms and supporting depressions. Therefore, we can firstly fold out a quarter of the depression and plateau patterns, which would help build a base to compose into a complex 3D structure. And then, we use this quarter as a template, and fold out the repeating patterns on the remaining three parts of the whole structure in turn. It is worth noting that the layout of the high platform not only needs to consider the regular contrast and symmetrical distribution of the length and width, but also needs to ensure the orderliness of the height dimension. This is very important, since we will never go back to this step after all parts were made, and you would better start from first step if you make anything wrong in the this step. Similar to the precautions in the first stage, please handle all the corners in three dimensions to ensure that they conform to the layout required in the plan, which would help us avoid the butterfly effect and increase the robustness in the process of three-dimensional folding. Just like building a skyscrapper in the real world, people usually take a lot of time when building the base but soon get finished when extending the structure after that. Time is worth to cost in the base, but would be saved in the future after you succeed in base. Anyway, let's continue. During the first quarter of the pattern, repeated comparisons with the finished rose were made to eliminate any possible errors in the first place. The final stage is to finish the petal grooming. At this stage, we often emphasize an important term called folding-by-heart. The intention here is no longer literally serious, but focus is moved to our understanding of the shape of a rose in nature, and we usually use natural curves to continuously correct the shape of petals in order to approach the shape of rose petals in reality. One more comment: this is also the cause of randomness to the art, which can be generated differently by different people. Recall that rose should be adjusted close to reality, so in the last step of this stage, we need to open the bloom in the center of the rose, by pulling on the four petals that have been bent. This process may be accompanied by the collapse of the overall structure of the rose, so we should be very careful to save strength of adjustment, and it must be well controlled to avoid irreversible consequences. Ultimately, after three stages of folding, we end up with a crown of rose with a similar shape close to reality. If condition is permited, we can wrap a green paper strip twisted on a straightened iron wire, and insert the rose crown we just created onto one side of the iron wire. In this way, we got a hand-made rose with a green stem. We can also repeat the steps above to increase the number of rose, so that it can be made into a cluster. Different color of rose is usually more attractive and can be considered as a better plan of gift to your friend. In summary, by creating a grid of creases, making a three-dimensional base, and finishing with petals, we created a three-dimensional rose from a two-dimensional paper. Although this process may seem simple, it is indeed a work of art created by us humans with the help of imagination and common materials. At last, Please comment to assess the above content."
+
+#echo ${INPUT1}
+
+INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+source /opt/intel/oneapi/setvars.sh
+
+export GGML_SYCL_DEVIC=0
+export GGML_SYCL_DEBUG=1
+#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
+./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
+
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000000000..c7e6d47beba15
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,7 @@
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
+
+#cmake .. -DLLAMA_CLBLAST=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake --build . --config Release -v

From ff83711055e7b1ac4f3423455380a7a9190f7630 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Fri, 29 Dec 2023 17:25:40 +0800
Subject: [PATCH 04/90] step 1

---
 ggml-sycl.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 5336ee3d27fdf..7b502b3b4f54d 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -122,7 +122,7 @@ static int g_ggml_sycl_debug=0;
 //#define GGML_SYCL_DEBUG(...) (if(g_ggml_sycl_debug) printf(__VA_ARGS__))
 #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
 
-#define MIN_CC_DP4A   510 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
+#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA      700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2      (CC_OFFSET_AMD + 1030)
@@ -246,7 +246,7 @@ static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
     }
 #endif // CUDART_VERSION >= 12000
 
-[[noreturn]]
+
 static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
     fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg);
     fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
@@ -11043,7 +11043,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
                 GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_vec_q path\n");
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
             } else {
-                GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_dequantize_mul_mat_vec path\n");
+                // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_dequantize_mul_mat_vec path\n");
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
             }
         } else {
@@ -11059,7 +11059,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
                 GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_q path\n");
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
             } else {
-                GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_cublas path\n");
+                //GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_cublas path\n");
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
             }
         }

From 02dffb68b874d12ee41da407a73cbbdfa14414e6 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Fri, 29 Dec 2023 17:43:22 +0800
Subject: [PATCH 05/90] step 2

---
 ggml-sycl.cpp | 35 +++++++++--------------------------
 1 file changed, 9 insertions(+), 26 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 7b502b3b4f54d..08aef929adc02 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -219,33 +219,16 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
-#if DPCT_COMPAT_RT_VERSION >= 12000
-    static const char *cublas_get_error_str(const int err) {
-        /*
-        DPCT1009:48: SYCL uses exceptions to report errors and does not use the
-        error codes. The original code was commented out and a warning string
-        was inserted. You need to rewrite this code.
-        */
-        return "cublasGetStatusString is not supported" /*cublasGetStatusString(err)*/
-            ;
-    }
-#else
-    static const char * cublas_get_error_str(const cublasStatus_t err) {
-        switch (err) {
-            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
-            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
-            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
-            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
-            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
-            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
-            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
-            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
-            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
-            default: return "unknown error";
-        }
-    }
-#endif // CUDART_VERSION >= 12000
 
+static const char *cublas_get_error_str(const int err) {
+    /*
+    DPCT1009:48: SYCL uses exceptions to report errors and does not use the
+    error codes. The original code was commented out and a warning string
+    was inserted. You need to rewrite this code.
+    */
+    return "cublasGetStatusString is not supported" /*cublasGetStatusString(err)*/
+        ;
+}
 
 static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
     fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg);

From 43f2c358592003d4a91c0fd49751250bbdc4856d Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Sun, 31 Dec 2023 12:38:57 +0800
Subject: [PATCH 06/90] step3 add fp16, slower 31->28

---
 ggml-sycl.cpp | 811 +++++++-------------------------------------------
 setup.sh      |   2 +
 2 files changed, 107 insertions(+), 706 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 08aef929adc02..5a92185c03305 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -1,7 +1,3 @@
-#define DPCT_PROFILING_ENABLED
-#define DPCT_COMPAT_RT_VERSION 12010
-#include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
 #include <algorithm>
 #include <assert.h>
 #include <atomic>
@@ -13,113 +9,21 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <vector>
-#include <dpct/blas_utils.hpp>
-
-#if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
-#include <hipblas/hipblas.h>
-#include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F  HIPBLAS_R_16F
-#define CUDA_R_32F  HIPBLAS_R_32F
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
-#define cublasCreate hipblasCreate
-#define cublasGemmEx hipblasGemmEx
-#define cublasGemmBatchedEx hipblasGemmBatchedEx
-#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
-#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceProp hipDeviceProp_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEvent_t hipEvent_t
-#define cudaEventDestroy hipEventDestroy
-#define cudaFree hipFree
-#define cudaFreeHost hipHostFree
-#define cudaGetDevice hipGetDevice
-#define cudaGetDeviceCount hipGetDeviceCount
-#define cudaGetDeviceProperties hipGetDeviceProperties
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#ifdef GGML_HIP_UMA
-#define cudaMalloc hipMallocManaged
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
-#else
-#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#endif
-#define cudaMemcpy hipMemcpy
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaMemcpyKind hipMemcpyKind
-#define cudaMemset hipMemset
-#define cudaMemsetAsync hipMemsetAsync
-#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
-#define cudaSetDevice hipSetDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamFireAndForget hipStreamFireAndForget
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#define __trap abort
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
-#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
-#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
-#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
-#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
-#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
-#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
-#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
-#else
+#include <cmath>
 
-#if DPCT_COMPAT_RT_VERSION < 11020
-#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
-#define CUBLAS_COMPUTE_16F CUDA_R_16F
-#define CUBLAS_COMPUTE_32F CUDA_R_32F
-#define cublasComputeType_t cudaDataType_t
-#endif // CUDART_VERSION < 11020
 
-#endif // defined(GGML_USE_HIPBLAS)
+#include <sycl/sycl.hpp>
+#include <sycl/half_type.hpp>
+#include <dpct/dpct.hpp>
+#include <dpct/blas_utils.hpp>
+#include <dpct/lib_common_utils.hpp>
 
 #include "ggml-cuda.h"
 #include "ggml.h"
 #include "ggml-backend-impl.h"
-#include <cmath>
-
-#include <dpct/lib_common_utils.hpp>
 
 static int g_ggml_sycl_debug=0;
 
-//#define GGML_SYCL_DEBUG(...) (if(g_ggml_sycl_debug) printf(__VA_ARGS__))
 #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
 
 #define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
@@ -147,71 +51,6 @@ static int g_ggml_sycl_debug=0;
 // max batch size to use MMQ kernels when tensor cores are available
 #define MMQ_MAX_BATCH_SIZE 32
 
-#if defined(GGML_USE_HIPBLAS)
-#define __CUDA_ARCH__ 1300
-
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
-#define RDNA3
-#endif
-
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
-#define RDNA2
-#endif
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int&>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int&>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(__gfx1100__)
-    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
-    int tmp1;
-    int tmp2;
-    asm("\n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
-    return c;
-}
-#endif // defined(GGML_USE_HIPBLAS)
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -257,7 +96,7 @@ You need to rewrite this code.
              ggml_cuda_error(#err, __func__, __FILE__, __LINE__,               \
                              cublas_get_error_str(err_)); } while (0)
 
-#if !defined(GGML_USE_HIPBLAS)
+
 static const char *cu_get_error_str(int err) {
     const char * err_str;
     /*
@@ -276,7 +115,7 @@ DPCT1000:68: Error handling if-stmt was detected but could not be rewritten.
     do { auto err_ = (err);                                                    \
          if (err_ != 0) ggml_cuda_error(#err, __func__, __FILE__, __LINE__,    \
                                         cu_get_error_str(err_)); } while (0)
-#endif
+
 
 #if DPCT_COMPAT_RT_VERSION >= 11100
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
@@ -284,13 +123,14 @@ DPCT1000:68: Error handling if-stmt was detected but could not be rewritten.
 #define GGML_CUDA_ASSUME(x)
 #endif // CUDART_VERSION >= 11100
 
-#ifdef GGML_CUDA_F16
-typedef half dfloat; // dequantize float
-typedef half2 dfloat2;
+#ifdef GGML_SYCL_F16
+typedef sycl::half dfloat; // dequantize float
+typedef sycl::half2 dfloat2;
 #else
 typedef float dfloat; // dequantize float
 typedef sycl::float2 dfloat2;
-#endif //GGML_CUDA_F16
+#endif //GGML_SYCL_F16
+
 
 static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) {
     const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
@@ -1080,13 +920,16 @@ static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
     v.x() = vui & 0xF;
     v.y() = vui >> 4;
 
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {8.0f, 8.0f});
-    v = __hmul2(v, {d, d});
+#ifdef GGML_SYCL_F16
+    // v = v - {8.0f, 8.0f};
+    // v = v * {d, d};
+    v.s0() = (v.s0() - 8.0f) * d;
+    v.s1() = (v.s1() - 8.0f) * d;
+
 #else
     v.x() = (v.x() - 8.0f) * d;
     v.y() = (v.y() - 8.0f) * d;
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
 }
 
 static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
@@ -1101,13 +944,16 @@ static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
     v.x() = vui & 0xF;
     v.y() = vui >> 4;
 
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
+#ifdef GGML_SYCL_F16
+    // v = v * {d, d};
+    // v = v + {m, m};
+    v.s0() = (v.s0() * d) + m;
+    v.s1() = (v.s1() * d) + m;
+
 #else
     v.x() = (v.x() * d) + m;
     v.y() = (v.y() * d) + m;
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
 }
 
 static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
@@ -1125,13 +971,16 @@ static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
     v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
     v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
 
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {16.0f, 16.0f});
-    v = __hmul2(v, {d, d});
+#ifdef GGML_SYCL_F16
+    // v = v - {16.0f, 16.0f};
+    // v = v * {d, d};
+    v.s0() = (v.s0() - 16.0f) * d;
+    v.s1() = (v.s1() - 16.0f) * d;
+
 #else
     v.x() = (v.x() - 16.0f) * d;
     v.y() = (v.y() - 16.0f) * d;
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
 }
 
 static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
@@ -1150,13 +999,15 @@ static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
     v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
     v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
 
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
+#ifdef GGML_SYCL_F16
+    // v = v * {d, d};
+    // v = v + {m, m};
+    v.s0() = (v.s0() * d) + m;
+    v.s1() = (v.s1() * d) + m;
 #else
     v.x() = (v.x() * d) + m;
     v.y() = (v.y() * d) + m;
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
 }
 
 static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
@@ -1168,12 +1019,14 @@ static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
     v.x() = x[ib].qs[iqs + 0];
     v.y() = x[ib].qs[iqs + 1];
 
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
+#ifdef GGML_SYCL_F16
+    // v = v * {d, d};
+    v.s0() *= d;
+    v.s1() *= d;
 #else
     v.x() *= d;
     v.y() *= d;
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
 }
 
 //================================== k-quants
@@ -2227,11 +2080,7 @@ template <int vdr>
 static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
                                                     const float &d4,
                                                     const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
-
 #pragma unroll
     for (int i = 0; i < vdr; ++i) {
         const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
@@ -2247,9 +2096,6 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
 
     // second part effectively subtracts 8 from each quant value
     return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y());
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 #define VDR_Q4_1_Q8_1_MMVQ 2
@@ -2260,8 +2106,6 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
                                                     const sycl::half2 &dm4,
                                                     const sycl::half2 &ds8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
 #pragma unroll
@@ -2274,10 +2118,11 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
         sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
     }
 
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
-    const float d4d8 = tmp.x;
-    const float m4s8 = tmp.y;
+#ifdef GGML_SYCL_F16
+    const sycl::float2 tmp =
+        (dm4 * ds8).convert<float, sycl::rounding_mode::automatic>();
+    const float d4d8 = tmp.x();
+    const float m4s8 = tmp.y();
 #else
     const sycl::float2 dm4f =
         dm4.convert<float, sycl::rounding_mode::automatic>();
@@ -2285,13 +2130,10 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
         ds8.convert<float, sycl::rounding_mode::automatic>();
     const float d4d8 = dm4f.x() * ds8f.x();
     const float m4s8 = dm4f.y() * ds8f.y();
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
 
     // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
     return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 #define VDR_Q5_0_Q8_1_MMVQ 2
@@ -2301,9 +2143,6 @@ template <int vdr>
 static __dpct_inline__ float
 vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
                        const float &d5, const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
 #pragma unroll
@@ -2330,9 +2169,6 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
 
     // second part effectively subtracts 16 from each quant value
     return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y());
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 #define VDR_Q5_1_Q8_1_MMVQ 2
@@ -2343,8 +2179,6 @@ static __dpct_inline__ float
 vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
                        const sycl::half2 &dm5, const sycl::half2 &ds8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
 #pragma unroll
@@ -2366,10 +2200,13 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
                           sumi); // SIMD dot product of quantized values
     }
 
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
-    const float d5d8 = tmp.x;
-    const float m5s8 = tmp.y;
+#ifdef GGML_SYCL_F16
+     const sycl::float2 tmp =
+        (dm5 * ds8).convert<float, sycl::rounding_mode::automatic>();
+    const float d5d8 = tmp.x();
+    const float m5s8 = tmp.y();
+
+
 #else
     const sycl::float2 dm5f =
         dm5.convert<float, sycl::rounding_mode::automatic>();
@@ -2377,14 +2214,10 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
         ds8.convert<float, sycl::rounding_mode::automatic>();
     const float d5d8 = dm5f.x() * ds8f.x();
     const float m5s8 = dm5f.y() * ds8f.y();
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
 
     // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
     return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 #define VDR_Q8_0_Q8_1_MMVQ 2
@@ -2395,8 +2228,6 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
                                                     const float &d8_0,
                                                     const float &d8_1) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
 #pragma unroll
@@ -2406,9 +2237,6 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
     }
 
     return d8_0*d8_1 * sumi;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 template <int vdr>
@@ -2416,8 +2244,6 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
                                                     const sycl::half2 &dm8,
                                                     const sycl::half2 &ds8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
 #pragma unroll
@@ -2426,10 +2252,11 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
         sumi = dpct::dp4a(v[i], u[i], sumi);
     }
 
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
-    const float d8d8 = tmp.x;
-    const float m8s8 = tmp.y;
+#ifdef GGML_SYCL_F16
+    const sycl::float2 tmp =
+        (dm8 * ds8).convert<float, sycl::rounding_mode::automatic>();
+    const float d8d8 = tmp.x();
+    const float m8s8 = tmp.y();
 #else
     const sycl::float2 dm8f =
         dm8.convert<float, sycl::rounding_mode::automatic>();
@@ -2437,13 +2264,10 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
         ds8.convert<float, sycl::rounding_mode::automatic>();
     const float d8d8 = dm8f.x() * ds8f.x();
     const float m8s8 = dm8f.y() * ds8f.y();
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
 
     // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
     return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 #define VDR_Q2_K_Q8_1_MMVQ 1
@@ -2454,8 +2278,6 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
     const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales,
     const sycl::half2 &dm2, const float *__restrict__ d8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
 
@@ -2482,9 +2304,6 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
         dm2.convert<float, sycl::rounding_mode::automatic>();
 
     return dm2f.x() * sumf_d - dm2f.y() * sumf_m;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 // contiguous u/y values
@@ -2493,8 +2312,6 @@ vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
                            const uint8_t *__restrict__ scales,
                            const sycl::half2 &dm2, const float &d8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi_d = 0;
     int sumi_m = 0;
 
@@ -2523,9 +2340,6 @@ vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
         dm2.convert<float, sycl::rounding_mode::automatic>();
 
     return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m);
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 #define VDR_Q3_K_Q8_1_MMVQ 1
@@ -2537,8 +2351,6 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
     const uint8_t *__restrict__ scales, const int &scale_offset,
     const float &d3, const float *__restrict__ d8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     float sumf = 0.0f;
 
 #pragma unroll
@@ -2566,9 +2378,6 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
     }
 
     return d3 * sumf;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 // contiguous u/y values
@@ -2577,8 +2386,6 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
                            const int8_t *__restrict__ scales, const float &d3,
                            const float &d8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     int sumi = 0;
 
 #pragma unroll
@@ -2593,9 +2400,6 @@ vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
     }
 
     return d3*d8 * sumi;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 #define VDR_Q4_K_Q8_1_MMVQ 2
@@ -2607,8 +2411,6 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
     const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
     const sycl::half2 &dm4, const float *__restrict__ d8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
 
@@ -2632,10 +2434,6 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
         dm4.convert<float, sycl::rounding_mode::automatic>();
 
     return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 // contiguous u/y values
@@ -2644,8 +2442,6 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
     const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
     const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
 
@@ -2670,10 +2466,6 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
         dm4.convert<float, sycl::rounding_mode::automatic>();
 
     return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 #define VDR_Q5_K_Q8_1_MMVQ 2
@@ -2686,8 +2478,6 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
     const uint8_t *__restrict__ m, const sycl::half2 &dm5,
     const float *__restrict__ d8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
 
@@ -2718,10 +2508,6 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
         dm5.convert<float, sycl::rounding_mode::automatic>();
 
     return dm5f.x() * sumf_d - dm5f.y() * sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 // contiguous u/y values
@@ -2730,8 +2516,6 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
     const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
     const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     float sumf_d = 0.0f;
     float sumf_m = 0.0f;
 
@@ -2756,10 +2540,6 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
         dm4.convert<float, sycl::rounding_mode::automatic>();
 
     return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 #define VDR_Q6_K_Q8_1_MMVQ 1
@@ -2772,8 +2552,6 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
                             const int8_t *__restrict__ scales, const float &d,
                             const float *__restrict__ d8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     float sumf = 0.0f;
 
 #pragma unroll
@@ -2791,9 +2569,6 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
     }
 
     return d*sumf;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 // contiguous u/y values
@@ -2802,8 +2577,6 @@ vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
                            const int8_t *__restrict__ sc, const float &d6,
                            const float *__restrict__ d8) {
 
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
     float sumf_d = 0.0f;
 
 #pragma unroll
@@ -2828,10 +2601,6 @@ vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
     }
 
     return d6 * sumf_d;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
 }
 
 static __dpct_inline__ float
@@ -4345,11 +4114,6 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
 #define NWARPS_Q4_0_PASCAL 8
 
 template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     mul_mat_q4_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
@@ -4360,24 +4124,8 @@ template <bool need_check> static void
     int   * tile_x_qh = nullptr;
     int   * tile_x_sc = nullptr;
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
-    const int nwarps = NWARPS_Q4_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
-    const int nwarps = NWARPS_Q4_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
+//sycl_todo: change according to hardware
 
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
     const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
     const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
     const int nwarps = NWARPS_Q4_0_AMPERE;
@@ -4388,20 +4136,6 @@ template <bool need_check> static void
               vec_dot_q4_0_q8_1_mul_mat>(
         vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
         tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
-    const int nwarps = NWARPS_Q4_0_PASCAL;
-
-    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q4_0_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define  MMQ_X_Q4_1_RDNA2  64
@@ -4424,13 +4158,6 @@ template <bool need_check> static void
 #define NWARPS_Q4_1_PASCAL 8
 
 template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q4_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
@@ -4441,23 +4168,7 @@ template <bool need_check> static void
     int   * tile_x_qh = nullptr;
     int   * tile_x_sc = nullptr;
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
-    const int nwarps = NWARPS_Q4_1_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
-    const int nwarps = NWARPS_Q4_1_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+//sycl_todo: change according to hardware
     const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
     const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
     const int nwarps = NWARPS_Q4_1_AMPERE;
@@ -4468,19 +4179,6 @@ template <bool need_check> static void
               vec_dot_q4_1_q8_1_mul_mat>(
         vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
         tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
-    const int nwarps = NWARPS_Q4_1_PASCAL;
-    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q4_1_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define  MMQ_X_Q5_0_RDNA2  64
@@ -4503,11 +4201,6 @@ template <bool need_check> static void
 #define NWARPS_Q5_0_PASCAL 8
 
 template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     mul_mat_q5_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
@@ -4518,22 +4211,7 @@ template <bool need_check> static void
     int   * tile_x_qh = nullptr;
     int   * tile_x_sc = nullptr;
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
-    const int nwarps = NWARPS_Q5_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
-    const int nwarps = NWARPS_Q5_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+//sycl_todo: change according to hardware
     const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
     const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
     const int nwarps = NWARPS_Q5_0_AMPERE;
@@ -4544,19 +4222,6 @@ template <bool need_check> static void
               vec_dot_q5_0_q8_1_mul_mat>(
         vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
         tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
-    const int nwarps = NWARPS_Q5_0_PASCAL;
-    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q5_0_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define  MMQ_X_Q5_1_RDNA2  64
@@ -4579,11 +4244,6 @@ template <bool need_check> static void
 #define NWARPS_Q5_1_PASCAL 8
 
 template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 mul_mat_q5_1(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
@@ -4594,22 +4254,7 @@ mul_mat_q5_1(
     int   * tile_x_qh = nullptr;
     int   * tile_x_sc = nullptr;
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
-    const int nwarps = NWARPS_Q5_1_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
-    const int nwarps = NWARPS_Q5_1_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+//sycl_todo: change according to hardware
     const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
     const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
     const int nwarps = NWARPS_Q5_1_AMPERE;
@@ -4620,19 +4265,6 @@ mul_mat_q5_1(
               vec_dot_q5_1_q8_1_mul_mat>(
         vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
         tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
-    const int nwarps = NWARPS_Q5_1_PASCAL;
-    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q5_1_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define  MMQ_X_Q8_0_RDNA2  64
@@ -4655,11 +4287,6 @@ mul_mat_q5_1(
 #define NWARPS_Q8_0_PASCAL 8
 
 template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
     mul_mat_q8_0(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
@@ -4670,22 +4297,7 @@ template <bool need_check> static void
     int   * tile_x_qh = nullptr;
     int   * tile_x_sc = nullptr;
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
-    const int nwarps = NWARPS_Q8_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
-    const int nwarps = NWARPS_Q8_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+//sycl_todo: change according to hardware
     const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
     const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
     const int nwarps = NWARPS_Q8_0_AMPERE;
@@ -4696,19 +4308,6 @@ template <bool need_check> static void
               vec_dot_q8_0_q8_1_mul_mat>(
         vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
         tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
-    const int nwarps = NWARPS_Q8_0_PASCAL;
-    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q8_0_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define  MMQ_X_Q2_K_RDNA2  64
@@ -4731,11 +4330,6 @@ template <bool need_check> static void
 #define NWARPS_Q2_K_PASCAL 8
 
 template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 mul_mat_q2_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
@@ -4747,22 +4341,7 @@ mul_mat_q2_K(
     int   * tile_x_qh = nullptr;
     int   * tile_x_sc = nullptr;
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
-    const int nwarps = NWARPS_Q2_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
-    const int nwarps = NWARPS_Q2_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+//sycl_todo: change according to hardware
     const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
     const int nwarps = NWARPS_Q2_K_AMPERE;
@@ -4773,19 +4352,6 @@ mul_mat_q2_K(
               vec_dot_q2_K_q8_1_mul_mat>(
         vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
         tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
-    const int nwarps = NWARPS_Q2_K_PASCAL;
-    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q2_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define  MMQ_X_Q3_K_RDNA2  128
@@ -4808,14 +4374,7 @@ mul_mat_q2_K(
 #define NWARPS_Q3_K_PASCAL 8
 
 template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q3_K(
+mul_mat_q3_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
     const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K,
@@ -4826,22 +4385,7 @@ template <bool need_check> static void
     int   * tile_x_qh = nullptr;
     int   * tile_x_sc = nullptr;
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
-    const int nwarps = NWARPS_Q3_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
-    const int nwarps = NWARPS_Q3_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+//sycl_todo: change according to hardware
     const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
     const int nwarps = NWARPS_Q3_K_AMPERE;
@@ -4853,19 +4397,6 @@ template <bool need_check> static void
               vec_dot_q3_K_q8_1_mul_mat>(
         vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
         tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
-    const int nwarps = NWARPS_Q3_K_PASCAL;
-    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q3_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define  MMQ_X_Q4_K_RDNA2  64
@@ -4888,13 +4419,6 @@ template <bool need_check> static void
 #define NWARPS_Q4_K_PASCAL 8
 
 template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q4_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
@@ -4906,22 +4430,7 @@ template <bool need_check> static void
     int   * tile_x_qh = nullptr;
     int   * tile_x_sc = nullptr;
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
-    const int nwarps = NWARPS_Q4_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
-    const int nwarps = NWARPS_Q4_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+//sycl_todo: change according to hardware
     const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
     const int nwarps = NWARPS_Q4_K_AMPERE;
@@ -4932,19 +4441,6 @@ template <bool need_check> static void
               vec_dot_q4_K_q8_1_mul_mat>(
         vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
         tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
-    const int nwarps = NWARPS_Q4_K_PASCAL;
-    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q4_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define  MMQ_X_Q5_K_RDNA2  64
@@ -4967,11 +4463,6 @@ template <bool need_check> static void
 #define NWARPS_Q5_K_PASCAL 8
 
 template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 mul_mat_q5_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
@@ -4983,22 +4474,7 @@ mul_mat_q5_K(
     int   * tile_x_qh = nullptr;
     int   * tile_x_sc = nullptr;
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
-    const int nwarps = NWARPS_Q5_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
-    const int nwarps = NWARPS_Q5_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+//sycl_todo: change according to hardware
     const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
     const int nwarps = NWARPS_Q5_K_AMPERE;
@@ -5009,19 +4485,6 @@ mul_mat_q5_K(
               vec_dot_q5_K_q8_1_mul_mat>(
         vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
         tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
-    const int nwarps = NWARPS_Q5_K_PASCAL;
-    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q5_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 #define  MMQ_X_Q6_K_RDNA2  64
@@ -5044,13 +4507,6 @@ mul_mat_q5_K(
 #define NWARPS_Q6_K_PASCAL 8
 
 template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
     mul_mat_q6_K(
     const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
     const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
@@ -5061,22 +4517,7 @@ template <bool need_check> static void
     int   * tile_x_qh = nullptr;
     // int   * tile_x_sc = nullptr;
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
-    const int nwarps = NWARPS_Q6_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
-    const int nwarps = NWARPS_Q6_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
+//sycl_todo: change according to hardware
     const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
     const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
     const int nwarps = NWARPS_Q6_K_AMPERE;
@@ -5087,19 +4528,6 @@ template <bool need_check> static void
               vec_dot_q6_K_q8_1_mul_mat>(
         vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
         tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
-    const int nwarps = NWARPS_Q6_K_PASCAL;
-    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q6_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
 }
 
 template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
@@ -5167,11 +4595,11 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
     const int y_offset = qr == 1 ? 1 : qk/2;
 
 // partial sum for each thread
-#ifdef GGML_CUDA_F16
-    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
+#ifdef GGML_SYCL_F16
+    sycl::half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
 #else
     float tmp = 0.0f;
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
 
     for (int i = 0; i < ncols; i += iter_stride) {
         const int col = i + vals_per_iter*tid;
@@ -5191,15 +4619,15 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
 
             // matrix multiplication
             // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
-#ifdef GGML_CUDA_F16
-            tmp += __hmul2(v, {
-                y[iybs + iqs + j/qr + 0],
-                y[iybs + iqs + j/qr + y_offset]
-            });
+#ifdef GGML_SYCL_F16
+            dfloat2 t1{y[iybs + iqs + j / qr + 0],
+                        y[iybs + iqs + j / qr + y_offset]};
+
+            tmp += v * t1;
 #else
             tmp += v.x() * y[iybs + iqs + j / qr + 0];
             tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
         }
     }
 
@@ -5211,11 +4639,11 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
     }
 
     if (tid == 0) {
-#ifdef GGML_CUDA_F16
-        dst[row] = tmp.x + tmp.y;
+#ifdef GGML_SYCL_F16
+        dst[row] = tmp.x() + tmp.y();
 #else
         dst[row] = tmp;
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
     }
 }
 
@@ -8728,7 +8156,6 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-#if !defined(GGML_USE_HIPBLAS)
 // pool with virtual memory
 /*
 DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.
@@ -8801,10 +8228,7 @@ catch (sycl::exception const &exc) {
             << ", line:" << __LINE__ << std::endl;
   std::exit(1);
 }
-#else
-#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
-#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
-#endif // !defined(GGML_USE_HIPBLAS)
+
 
 template<typename T>
 struct cuda_pool_alloc {
@@ -8924,16 +8348,13 @@ void ggml_init_cublas() try {
 
             g_tensor_split[id] = total_vram;
             total_vram += prop.get_global_mem_size();
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
-#else
             /*
             DPCT1005:87: The SYCL device version is different from CUDA Compute
             Compatibility. You may need to rewrite this code.
             */
             g_device_caps[id].cc =
                 100 * prop.get_major_version() + 10 * prop.get_minor_version();
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
             // g_device_caps[id].cc = 9000;
             printf("g_device_caps[%d].cc=%d\n", id, g_device_caps[id].cc);
         }
@@ -9555,29 +8976,6 @@ static int64_t get_row_rounding(ggml_type type) {
         }
     }
 
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
-        case GGML_TYPE_Q3_K:
-            return min_compute_capability < CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#else
     switch(type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
@@ -9599,7 +8997,6 @@ static int64_t get_row_rounding(ggml_type type) {
         default:
             GGML_ASSERT(false);
     }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
 }
 
 inline void ggml_cuda_op_mul_mat_vec_q(
@@ -9668,9 +9065,9 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     const int64_t row_diff = row_high - row_low;
 
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef GGML_CUDA_F16
-    cuda_pool_alloc<half> src1_dfloat_a;
-    half * src1_dfloat = nullptr; // dfloat == half
+#ifdef GGML_SYCL_F16
+    cuda_pool_alloc<sycl::half> src1_dfloat_a;
+    sycl::half *src1_dfloat = nullptr; // dfloat == half
 
     bool src1_convert_f16 =
         src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
@@ -9679,13 +9076,13 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
 
     if (src1_convert_f16) {
         src1_dfloat = src1_dfloat_a.alloc(ne00);
-        ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
-                                ne00, 1, sizeof(float), 0, 0,
-                                ne00, 1, sizeof(half),  0, 0, stream);
+        ggml_cpy_f32_f16_cuda((const char *)src1_ddf_i, (char *)src1_dfloat,
+                              ne00, ne00, 1, sizeof(float), 0, 0, ne00, 1,
+                              sizeof(sycl::half), 0, 0, stream);
     }
 #else
     const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
-#endif // GGML_CUDA_F16
+#endif // GGML_SYCL_F16
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
@@ -9761,9 +9158,11 @@ inline void ggml_cuda_op_mul_mat_cublas(
 
     const int compute_capability = g_device_caps[id].cc;
 
+    // if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
     if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp16 path\n");
+        // GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp16 path\n");
         cuda_pool_alloc<sycl::half> src0_as_f16;
         if (src0->type != GGML_TYPE_F16) {
             const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
@@ -9805,7 +9204,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
         to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
     }
     else {
-        GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp32 path\n");
+        // GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp32 path\n");
         cuda_pool_alloc<float> src0_ddq_as_f32;
 
         if (src0->type != GGML_TYPE_F32) {
@@ -11023,7 +10422,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 
             if (use_mul_mat_vec_q) {
                 // NOTE: this kernel does not support ggml_nrows(src1) > 1
-                GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_vec_q path\n");
+                // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_vec_q path\n");
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
             } else {
                 // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_dequantize_mul_mat_vec path\n");
@@ -11037,12 +10436,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
             if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
                 use_mul_mat_q = false;
             }
-            // use_mul_mat_q = false;//zjy
+
             if (use_mul_mat_q) {
-                GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_q path\n");
+                // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_q path\n");
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
             } else {
-                //GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_cublas path\n");
+                // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_cublas path\n");
                 ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
             }
         }
diff --git a/setup.sh b/setup.sh
index c7e6d47beba15..b5d3809caebd1 100755
--- a/setup.sh
+++ b/setup.sh
@@ -3,5 +3,7 @@ cd build
 source /opt/intel/oneapi/setvars.sh
 
 #cmake .. -DLLAMA_CLBLAST=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 cmake --build . --config Release -v

From da752edaf56a0be654d40773d743659a1c094b6c Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Sun, 31 Dec 2023 14:40:51 +0800
Subject: [PATCH 07/90] add GGML_LIST_DEVICE function

---
 CMakeLists.txt |  3 ++-
 README_sycl.md | 51 +++++++++++++++++++++++++++++++++++++++++++++-----
 ggml-sycl.cpp  |  8 +++++++-
 run.sh         |  1 +
 4 files changed, 56 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 51089c3b5b742..a2895692d6556 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -474,13 +474,14 @@ if (LLAMA_SYCL)
 
     if (_sycl_support)
 	    add_compile_definitions(GGML_USE_CUBLAS)
+	    #add_compile_definitions(GGML_SYCL_F16)
 	    #add_compile_options(-std=c++17 -O3 -fsycl)
         add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include)
         add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include/sycl)
         add_compile_options(-I/opt/intel/oneapi/dpcpp-ct/2024.0/include)
         add_compile_options(-I/opt/intel/oneapi/2024.0/include)
 
-	    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
+	    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -Wno-narrowing")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
 
diff --git a/README_sycl.md b/README_sycl.md
index e76b9bbb4b85b..83b8a40ccc5ce 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -43,7 +43,17 @@ If you want to get more binary files, please change the build prject.
 
 1. Install Intel oneAPI Base toolkit.
 
-2. Setup Local
+2. Build locally:
+
+```
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+cmake --build . --config Release -v
+```
+
+or
 
 ```
 ./setup.sh
@@ -56,17 +66,22 @@ If you want to get more binary files, please change the build prject.
 Run without parameter:
 
 ```
+export GGML_SYCL_LIST_DEVICE=1
 ./build/bin/main
 ```
 
 Check the id in startup log, like:
-ggml_init_cublas: found 6 CUDA devices:
+
+```
+SYCL devices:
   Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3
   Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2
   Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0
   Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0
   Device 4: Intel(R) UHD Graphics 770, compute capability 3.0
   Device 5: Intel(R) UHD Graphics 770, compute capability 1.3
+Exit for list devices task. unset GGML_SYCL_LIST_DEVICE to restore LLM work!
+```
 
 #### Put model file to folder **models**
 
@@ -133,17 +148,21 @@ Run the command in command line or powershell.
 Run without parameter:
 
 ```
+set GGML_SYCL_LIST_DEVICE=1
 .\x64\Release\llama.cpp.sycl.exe
 ```
-
 Check the id in startup log, like:
-ggml_init_cublas: found 6 CUDA devices:
+
+```
+SYCL devices:
   Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3
   Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2
   Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0
   Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0
   Device 4: Intel(R) UHD Graphics 770, compute capability 3.0
   Device 5: Intel(R) UHD Graphics 770, compute capability 1.3
+Exit for list devices task. unset GGML_SYCL_LIST_DEVICE to restore LLM work!
+```
 
 #### Put model file to folder **models**
 
@@ -160,4 +179,26 @@ set GGML_SYCL_DEVICE=0
 #### Run
 ```
 .\run.bat
-```
\ No newline at end of file
+```
+
+### Environment Variable
+
+#### Build
+
+|Name|Value|Function|
+|-|-|-|
+|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path|
+|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
+|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
+|GGML_SYCL_F16|OFF (default) or ON|Enable FP16 in computing|
+
+
+#### Running
+
+
+|Name|Value|Function|
+|-|-|-|
+|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. List the device ids by **GGML_SYCL_LIST_DEVICE**|
+|GGML_SYCL_DEBUG|0 (default) or 1|Enable log funciton by macro: GGML_SYCL_DEBUG|
+|GGML_SYCL_LIST_DEVICE|0 (default) or 1|List the device ids only|
+
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 5a92185c03305..16271f2aa6d27 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -8282,7 +8282,6 @@ void print_devices(){
 
 int get_sycl_env(const char* env_name, int default_val){
     char * user_device_string = getenv(env_name);
-    printf("get_sycl_env=%s=%s\n", env_name, user_device_string);
     int user_number = default_val;
 
     unsigned n;
@@ -8297,6 +8296,13 @@ int get_sycl_env(const char* env_name, int default_val){
 void ggml_init_cublas() try {
     static bool initialized = false;
     if (!initialized) {
+        if (get_sycl_env("GGML_SYCL_LIST_DEVICE", 0)!=0){
+            printf("SYCL devices:\n");
+            print_devices();
+            printf("Exit for list devices task. unset GGML_SYCL_LIST_DEVICE to restore LLM work!\n");
+            std::exit(0);
+        }
+
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
 
         printf("g_ggml_sycl_debug=%d\n", g_ggml_sycl_debug);
diff --git a/run.sh b/run.sh
index 6ac5b2a51edac..15714240e2a3e 100755
--- a/run.sh
+++ b/run.sh
@@ -7,6 +7,7 @@ source /opt/intel/oneapi/setvars.sh
 
 export GGML_SYCL_DEVIC=0
 export GGML_SYCL_DEBUG=1
+export GGML_SYCL_LIST_DEVICE=1
 #./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
 

From 6dd32789b4e64a6b7318b1e5e4df751dbe5eaae0 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Sun, 31 Dec 2023 15:48:00 +0800
Subject: [PATCH 08/90] step 5 format device and print

---
 ggml-sycl.cpp | 292 ++++++++++++++++++++++----------------------------
 run.sh        |  11 +-
 2 files changed, 138 insertions(+), 165 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 16271f2aa6d27..df26b7f576c2a 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -59,20 +59,10 @@ static int g_ggml_sycl_debug=0;
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 
-static const char *cublas_get_error_str(const int err) {
-    /*
-    DPCT1009:48: SYCL uses exceptions to report errors and does not use the
-    error codes. The original code was commented out and a warning string
-    was inserted. You need to rewrite this code.
-    */
-    return "cublasGetStatusString is not supported" /*cublasGetStatusString(err)*/
-        ;
-}
-
-static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
-    fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg);
+static void ggml_sycl_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
+    fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
     fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
-    GGML_ASSERT(!"CUDA error");
+    GGML_ASSERT(!"SYCL error");
 }
 
 /*
@@ -86,36 +76,11 @@ DPCT1009:52: SYCL uses exceptions to report errors and does not use the error
 codes. The original code was commented out and a warning string was inserted.
 You need to rewrite this code.
 */
-#define CUDA_CHECK(err) do {                                                   \
-    auto err_ = (err); if (err_ != 0) ggml_cuda_error(                         \
+#define SYCL_CHECK(err) do {                                                   \
+    auto err_ = (err); if (err_ != 0) ggml_sycl_error(                         \
         #err, __func__, __FILE__, __LINE__,                                    \
-        "cudaGetErrorString is not supported" /*cudaGetErrorString(err_)*/);   \
+        "Meet error in this line code!");   \
 } while (0)
-#define CUBLAS_CHECK(err)                                                      \
-    do { auto err_ = (err); if (err_ != 0)                                     \
-             ggml_cuda_error(#err, __func__, __FILE__, __LINE__,               \
-                             cublas_get_error_str(err_)); } while (0)
-
-
-static const char *cu_get_error_str(int err) {
-    const char * err_str;
-    /*
-    DPCT1007:49: Migration of cuGetErrorString is not supported.
-    */
-    // cuGetErrorString(err, &err_str);
-    return err_str;
-}
-/*
-DPCT1001:67: The statement could not be removed.
-*/
-/*
-DPCT1000:68: Error handling if-stmt was detected but could not be rewritten.
-*/
-#define CU_CHECK(err)                                                          \
-    do { auto err_ = (err);                                                    \
-         if (err_ != 0) ggml_cuda_error(#err, __func__, __FILE__, __LINE__,    \
-                                        cu_get_error_str(err_)); } while (0)
-
 
 #if DPCT_COMPAT_RT_VERSION >= 11100
 #define GGML_CUDA_ASSUME(x) __builtin_assume(x)
@@ -401,7 +366,7 @@ struct ggml_tensor_extra_gpu {
 // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
 inline dpct::err0 ggml_cuda_set_device(const int device) try {
     int current_device;
-    CUDA_CHECK(DPCT_CHECK_ERROR(
+    SYCL_CHECK(DPCT_CHECK_ERROR(
         current_device = dpct::dev_mgr::instance().current_device_id()));
 
     if (device == current_device) {
@@ -6420,7 +6385,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
@@ -6535,7 +6500,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
@@ -6650,7 +6615,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
@@ -6765,7 +6730,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
@@ -6880,7 +6845,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
@@ -6995,7 +6960,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
@@ -7118,7 +7083,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
 #if QK_K == 256
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
@@ -7246,7 +7211,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
@@ -7367,7 +7332,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
@@ -7488,7 +7453,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
                                         dpct::queue_ptr stream) try {
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     const int compute_capability = g_device_caps[id].cc;
 
@@ -8073,7 +8038,7 @@ static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
 static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 #ifdef DEBUG_CUDA_MALLOC
     int nnz = 0;
@@ -8115,7 +8080,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
     void * ptr;
     size_t look_ahead_size = (size_t) (1.05 * size);
     look_ahead_size = 256 * ((look_ahead_size + 255)/256);
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(
                              look_ahead_size, dpct::get_in_order_queue())));
     *actual_size = look_ahead_size;
@@ -8135,7 +8100,7 @@ catch (sycl::exception const &exc) {
 static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
@@ -8147,7 +8112,7 @@ static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
         }
     }
     fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
-    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+    SYCL_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
     g_cuda_pool_size[id] -= size;
 }
 catch (sycl::exception const &exc) {
@@ -8179,7 +8144,7 @@ catch (sycl::exception const &exc) {
 static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 
 #ifdef DEBUG_CUDA_MALLOC
@@ -8199,7 +8164,7 @@ catch (sycl::exception const &exc) {
 
 static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try {
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     if (g_device_caps[id].vmm) {
         return ggml_cuda_pool_malloc_vmm(size, actual_size);
@@ -8215,7 +8180,7 @@ catch (sycl::exception const &exc) {
 
 static void ggml_cuda_pool_free(void *ptr, size_t size) try {
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
     if (g_device_caps[id].vmm) {
         ggml_cuda_pool_free_vmm(ptr, size);
@@ -8270,9 +8235,10 @@ bool ggml_cublas_loaded(void) {
 }
 void print_devices(){
     int device_count = dpct::dev_mgr::instance().device_count();
+    fprintf(stderr, "%s: found %d SYCL devices:\n", __func__, device_count);
     for (int id = 0; id < device_count; ++id) {
         dpct::device_info prop;
-        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
             prop, dpct::dev_mgr::instance().get_device(id))));
         fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id,
                 prop.get_name(), prop.get_major_version(),
@@ -8305,7 +8271,7 @@ void ggml_init_cublas() try {
 
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
 
-        printf("g_ggml_sycl_debug=%d\n", g_ggml_sycl_debug);
+        printf("GGML_SYCL_DEBUG=%d\n", g_ggml_sycl_debug);
 
         int user_device_number = get_sycl_env("GGML_SYCL_DEVICE", 0);
 
@@ -8321,17 +8287,19 @@ void ggml_init_cublas() try {
 
         GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
-#if defined(GGML_CUDA_FORCE_MMQ)
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
+
+#if defined(GGML_SYCL_FP16)
+        fprintf(stderr, "%s: GGML_SYCL_FP16:   yes\n", __func__);
 #else
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
+        fprintf(stderr, "%s: GGML_SYCL_FP16:   no\n", __func__);
 #endif
+
+
 #if defined(CUDA_USE_TENSOR_CORES)
         fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
 #else
         fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
-        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
 
         //zjy hardcode, force set to 1 device
         g_device_count = 1;
@@ -8341,8 +8309,8 @@ void ggml_init_cublas() try {
             g_device_caps[id].vmm = !!device_vmm;
 
             dpct::device_info prop;
-            CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
-                prop, dpct::dev_mgr::instance().get_device(id))));
+            SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+                prop, dpct::dev_mgr::instance().get_device(user_device_number))));
             /*
             DPCT1005:86: The SYCL device version is different from CUDA Compute
             Compatibility. You may need to rewrite this code.
@@ -8369,7 +8337,7 @@ void ggml_init_cublas() try {
         }
 
         for (int id = 0; id < g_device_count; ++id) {
-            CUDA_CHECK(ggml_cuda_set_device(user_device_number));
+            SYCL_CHECK(ggml_cuda_set_device(user_device_number));
 
             // create cuda streams
             for (int is = 0; is < MAX_STREAMS; ++is) {
@@ -8377,25 +8345,25 @@ void ggml_init_cublas() try {
                 DPCT1025:88: The SYCL queue is created ignoring the flag and
                 priority options.
                 */
-                CUDA_CHECK(DPCT_CHECK_ERROR(
+                SYCL_CHECK(DPCT_CHECK_ERROR(
                     g_cudaStreams[id][is] =
                         dpct::get_current_device().create_queue()));
             }
 
             // create cublas handle
-            CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
+            SYCL_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
                                               &dpct::get_in_order_queue()));
             /*
             DPCT1027:89: The call to cublasSetMathMode was replaced with 0
             because this functionality is redundant in SYCL.
             */
-            CUBLAS_CHECK(0);
+            SYCL_CHECK(0);
         }
 
         // configure logging to stdout
-        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+        // SYCL_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
         ggml_cuda_set_device(user_device_number);
-        fprintf(stderr, "  set Device %d\n", user_device_number);
+        fprintf(stderr, "Using Device %d\n", user_device_number);
         initialized = true;
         g_cublas_loaded = true;
     }
@@ -8473,7 +8441,7 @@ catch (sycl::exception const &exc) {
 }
 
 void ggml_cuda_host_free(void *ptr) try {
-    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+    SYCL_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -8497,7 +8465,7 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
         kind = dpct::device_to_device;
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
-        CUDA_CHECK(DPCT_CHECK_ERROR(
+        SYCL_CHECK(DPCT_CHECK_ERROR(
             id = dpct::dev_mgr::instance().current_device_id()));
         src_ptr = (char *) extra->data_device[id];
     } else {
@@ -8915,7 +8883,7 @@ inline void ggml_cuda_op_mul_mat_q(
     const int64_t row_diff = row_high - row_low;
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 
     // the main device has a larger memory buffer to hold the results from all GPUs
@@ -9155,7 +9123,7 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const int64_t row_diff = row_high - row_low;
 
     int id;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 
     // the main device has a larger memory buffer to hold the results from all GPUs
@@ -9197,8 +9165,8 @@ inline void ggml_cuda_op_mul_mat_cublas(
         const sycl::half alpha_f16 = 1.0f;
         const sycl::half beta_f16 = 0.0f;
 
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm(
+        SYCL_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
+        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::gemm(
             *g_cublas_handles[id], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
             &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
@@ -9224,8 +9192,8 @@ inline void ggml_cuda_op_mul_mat_cublas(
         const float alpha = 1.0f;
         const float beta = 0.0f;
 
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(
+        SYCL_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
+        SYCL_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(
             *g_cublas_handles[id], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
             dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00,
@@ -9494,7 +9462,7 @@ inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
     DPCT1010:87: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
     */
-    CUDA_CHECK(0);
+    SYCL_CHECK(0);
 
     (void) src1;
     (void) dst;
@@ -9519,7 +9487,7 @@ inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
     DPCT1010:88: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
     */
-    CUDA_CHECK(0);
+    SYCL_CHECK(0);
 
     (void) src1;
     (void) dst;
@@ -9561,7 +9529,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
         src0_ddf = (float *) src0_extra->data_device[g_main_device];
     } else {
         src0_ddf = src0_f.alloc(ggml_nelements(src0));
-        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
+        SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
     if (use_src1) {
@@ -9569,7 +9537,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
             src1_ddf = (float *) src1_extra->data_device[g_main_device];
         } else {
             src1_ddf = src1_f.alloc(ggml_nelements(src1));
-            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
+            SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
         }
     }
     if (dst_on_device) {
@@ -9584,16 +9552,16 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
     DPCT1010:89: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
     */
-    CUDA_CHECK(0);
+    SYCL_CHECK(0);
 
     // copy dst to host if necessary
     if (!dst_on_device) {
-        CUDA_CHECK(DPCT_CHECK_ERROR(
+        SYCL_CHECK(DPCT_CHECK_ERROR(
             main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
     }
 
     if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(DPCT_CHECK_ERROR(
+        SYCL_CHECK(DPCT_CHECK_ERROR(
             dpct::get_current_device().queues_wait_and_throw()));
     }
 }
@@ -9614,12 +9582,12 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
 
 #ifdef NDEBUG
     for (int id = 0; id < g_device_count; ++id) {
-        CUDA_CHECK(ggml_cuda_set_device(id));
-        // CUDA_CHECK(cudaDeviceSynchronize());
+        SYCL_CHECK(ggml_cuda_set_device(id));
+        // SYCL_CHECK(cudaDeviceSynchronize());
     }
 
     for (int id = 0; id < g_device_count; ++id) {
-        CUDA_CHECK(ggml_cuda_set_device(id));
+        SYCL_CHECK(ggml_cuda_set_device(id));
 
         for (int id_other = 0; id_other < g_device_count; ++id_other) {
             if (id == id_other) {
@@ -9630,12 +9598,12 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
             }
 
             int can_access_peer;
-            // CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            // SYCL_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
             // if (can_access_peer) {
             //     if (enable_peer_access) {
-            //         CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+            //         SYCL_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
             //     } else {
-            //         CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
+            //         SYCL_CHECK(cudaDeviceDisablePeerAccess(id_other));
             //     }
             // }
         }
@@ -9776,7 +9744,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                 use the error codes. The call was replaced with 0. You need to
                 rewrite this code.
                 */
-                CUDA_CHECK(0);
+                SYCL_CHECK(0);
             }
         }
 
@@ -9791,13 +9759,13 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
     if (split && used_devices > 1) {
-        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        SYCL_CHECK(ggml_cuda_set_device(g_main_device));
         /*
         DPCT1024:91: The original code returned the error code that was further
         consumed by the program logic. This original code was replaced with 0.
         You may need to rewrite the program logic consuming the error code.
         */
-        CUDA_CHECK(DPCT_CHECK_ERROR(
+        SYCL_CHECK(DPCT_CHECK_ERROR(
             *src0_extra->events[g_main_device][0] =
                 g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier()));
     }
@@ -9821,7 +9789,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
 
             // wait for main GPU data if necessary
             if (split && (id != g_main_device || is != 0)) {
-                CUDA_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier(
+                SYCL_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier(
                     {*src0_extra->events[g_main_device][0]})));
             }
 
@@ -9848,20 +9816,20 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                     if (id != g_main_device) {
                         if (convert_src1_to_q8_1) {
                             char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
-                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                            SYCL_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
                                 src1_ddq_i, src1_ddq_i_source,
                                 src1_ncols * src1_padded_col_size * q8_1_ts /
                                     q8_1_bs)));
                         } else {
                             float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
                             src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
-                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                            SYCL_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
                                 src1_ddf_i, src1_ddf_i_source,
                                 src1_ncols * ne10 * sizeof(float))));
                         }
                     }
                 } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
+                    SYCL_CHECK(ggml_cuda_cpy_tensor_2d(
                                    src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
                 } else {
                     GGML_ASSERT(false);
@@ -9874,11 +9842,11 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                     not use the error codes. The call was replaced with 0. You
                     need to rewrite this code.
                     */
-                    CUDA_CHECK(0);
+                    SYCL_CHECK(0);
                 }
 
                 if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
+                    SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
                 }
 
                 // do the computation
@@ -9889,7 +9857,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                 use the error codes. The call was replaced with 0. You need to
                 rewrite this code.
                 */
-                CUDA_CHECK(0);
+                SYCL_CHECK(0);
 
                 // copy dst to host or other device if necessary
                 if (!dst_on_device) {
@@ -9913,7 +9881,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                         float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                         dhf_dst_i += src1_col_0*ne0 + row_low[id];
-                        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
+                        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
                             dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
                             row_diff * sizeof(float), row_diff * sizeof(float),
                             src1_ncols, kind, *stream)));
@@ -9921,7 +9889,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                         float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                         dhf_dst_i += src1_col_0*ne0;
-                        CUDA_CHECK(DPCT_CHECK_ERROR(
+                        SYCL_CHECK(DPCT_CHECK_ERROR(
                             stream->memcpy(dhf_dst_i, dst_dd_i,
                                            src1_ncols * ne0 * sizeof(float))));
                     }
@@ -9935,7 +9903,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                     code was replaced with 0. You may need to rewrite the
                     program logic consuming the error code.
                     */
-                    CUDA_CHECK(DPCT_CHECK_ERROR(
+                    SYCL_CHECK(DPCT_CHECK_ERROR(
                         *src0_extra->events[id][is] =
                             stream->ext_oneapi_submit_barrier()));
                 }
@@ -9947,7 +9915,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
         if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
             continue;
         }
-        CUDA_CHECK(ggml_cuda_set_device(id));
+        SYCL_CHECK(ggml_cuda_set_device(id));
 
         // free buffers again when done
         if (dst_as[id] > 0) {
@@ -9969,13 +9937,13 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
         int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
         is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
 
-        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+        SYCL_CHECK(ggml_cuda_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
             if (row_low[id] == row_high[id]) {
                 continue;
             }
             for (int64_t is = 0; is < is_max; ++is) {
-                CUDA_CHECK(DPCT_CHECK_ERROR(
+                SYCL_CHECK(DPCT_CHECK_ERROR(
                     g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier(
                         {*src0_extra->events[id][is]})));
             }
@@ -9983,8 +9951,8 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
     }
 
     if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-        CUDA_CHECK(DPCT_CHECK_ERROR(
+        SYCL_CHECK(ggml_cuda_set_device(g_main_device));
+        SYCL_CHECK(DPCT_CHECK_ERROR(
             dpct::get_current_device().queues_wait_and_throw()));
     }
 }
@@ -10101,7 +10069,7 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0,
 
     const int64_t ne12 = src1->ne[2];
 
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
     dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -10140,7 +10108,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0,
 
     const int64_t ne12 = src1->ne[2];
 
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
     dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -10219,10 +10187,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne  = ggml_nelements(dst);
 
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
     dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
 
-    CUBLAS_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream));
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -10291,7 +10259,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
                 int i03 = i13 / r3;
                 int i02 = i12 / r2;
 
-                CUBLAS_CHECK(
+                SYCL_CHECK(
                         cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
                             ne01, ne11, ne10,
                             alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
@@ -10306,7 +10274,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
     if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
+        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
             *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const char *)src0_as_f16, dpct::library_data_t::real_half,
@@ -10353,9 +10321,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
         error codes. The call was replaced with 0. You need to rewrite this
         code.
         */
-        CUDA_CHECK(0);
+        SYCL_CHECK(0);
 
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
+        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
             *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const void **)(ptrs_src.get() + 0 * ne23),
@@ -10534,10 +10502,10 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne  = ggml_nelements(dst);
 
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
     cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
 
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
+    SYCL_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
 
     //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     //void * src0_ddq = src0_extra->data_device[g_main_device];
@@ -10607,9 +10575,9 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
             dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
             dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
     );
-    CUDA_CHECK(cudaGetLastError());
+    SYCL_CHECK(cudaGetLastError());
 
-    CUBLAS_CHECK(
+    SYCL_CHECK(
     cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
             ne01, ne11, ne10,
             &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
@@ -10658,9 +10626,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
 
     if (ids->backend == GGML_BACKEND_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(DPCT_CHECK_ERROR(
+        SYCL_CHECK(DPCT_CHECK_ERROR(
             stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
-        CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait()));
+        SYCL_CHECK(DPCT_CHECK_ERROR(stream->wait()));
     } else {
         memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
     }
@@ -10691,8 +10659,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
 
         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
             //int32_t row_id;
-            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+            //SYCL_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+            //SYCL_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
 
             const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
@@ -10735,7 +10703,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
 
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-                CUDA_CHECK(DPCT_CHECK_ERROR(
+                SYCL_CHECK(DPCT_CHECK_ERROR(
                     stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
                                    src1_original + i01 * nb11, nb11)));
                 num_src1_rows++;
@@ -10768,7 +10736,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
 
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-                CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                SYCL_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
                     dst_original + i01 * nb1,
                     dst_contiguous.get() + num_src1_rows * nb1, nb1)));
                 num_src1_rows++;
@@ -10777,7 +10745,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
     }
 
     if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait()));
+        SYCL_CHECK(DPCT_CHECK_ERROR(stream->wait()));
     }
 }
 catch (sycl::exception const &exc) {
@@ -10821,7 +10789,7 @@ static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
 
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
     dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
 
     const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
@@ -10958,19 +10926,19 @@ void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
         }
 
         char * buf;
-        CUDA_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(
+        SYCL_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(
                                         size, dpct::get_in_order_queue())));
         char * buf_host = (char *)data + offset_split;
 
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
-            CUDA_CHECK(DPCT_CHECK_ERROR(
+            SYCL_CHECK(DPCT_CHECK_ERROR(
                 dpct::get_in_order_queue()
                     .memset(buf + original_size, 0, size - original_size)
                     .wait()));
         }
 
-        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
                                         .memcpy(buf, buf_host, original_size)
                                         .wait()));
 
@@ -10978,7 +10946,7 @@ void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
 
         if (backend == GGML_BACKEND_GPU_SPLIT) {
             for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-                CUDA_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] =
+                SYCL_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] =
                                                 new sycl::event()));
             }
         }
@@ -11001,15 +10969,15 @@ void ggml_cuda_free_data(struct ggml_tensor *tensor) try {
 
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
-            CUDA_CHECK(ggml_cuda_set_device(id));
-            CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(
+            SYCL_CHECK(ggml_cuda_set_device(id));
+            SYCL_CHECK(DPCT_CHECK_ERROR(sycl::free(
                 extra->data_device[id], dpct::get_in_order_queue())));
         }
 
         for (int64_t is = 0; is < MAX_STREAMS; ++is) {
             if (extra->events[id][is] != nullptr) {
-                CUDA_CHECK(ggml_cuda_set_device(id));
-                CUDA_CHECK(DPCT_CHECK_ERROR(
+                SYCL_CHECK(ggml_cuda_set_device(id));
+                SYCL_CHECK(DPCT_CHECK_ERROR(
                     dpct::destroy_event(extra->events[id][is])));
             }
         }
@@ -11070,7 +11038,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
+    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
@@ -11093,7 +11061,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
 
         char * data = (char *) g_scratch_buffer;
         if (data == nullptr) {
-            CUDA_CHECK(DPCT_CHECK_ERROR(
+            SYCL_CHECK(DPCT_CHECK_ERROR(
                 data = (char *)sycl::malloc_device(
                     g_scratch_size, dpct::get_in_order_queue())));
             g_scratch_buffer = data;
@@ -11106,9 +11074,9 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
         GGML_ASSERT(g_scratch_offset <= g_scratch_size);
     } else { // allocate new buffers outside of scratch
         void * data;
-        CUDA_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(
+        SYCL_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(
                                         size, dpct::get_in_order_queue())));
-        CUDA_CHECK(DPCT_CHECK_ERROR(
+        SYCL_CHECK(DPCT_CHECK_ERROR(
             dpct::get_in_order_queue().memset(data, 0, size).wait()));
         extra = new ggml_tensor_extra_gpu;
         memset(extra, 0, sizeof(*extra));
@@ -11130,7 +11098,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor,
     }
     if (g_scratch_buffer == nullptr) {
         ggml_cuda_set_device(g_main_device);
-        CUDA_CHECK(
+        SYCL_CHECK(
             DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
                                  g_scratch_size, dpct::get_in_order_queue())));
     }
@@ -11164,8 +11132,8 @@ void ggml_cuda_copy_to_device(struct ggml_tensor *tensor) try {
     GGML_ASSERT(ggml_is_contiguous(tensor));
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
+    SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
                                     .memcpy(extra->data_device[g_main_device],
                                             tensor->data, ggml_nbytes(tensor))
                                     .wait()));
@@ -11202,7 +11170,7 @@ void ggml_cuda_set_main_device(const int main_device) try {
     if (g_main_device != main_device && g_device_count > 1) {
         g_main_device = main_device;
         dpct::device_info prop;
-        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
             prop, dpct::dev_mgr::instance().get_device(g_main_device))));
         fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__,
                 g_main_device, prop.get_name());
@@ -11228,7 +11196,7 @@ void ggml_cuda_free_scratch() try {
         return;
     }
 
-    CUDA_CHECK(DPCT_CHECK_ERROR(
+    SYCL_CHECK(DPCT_CHECK_ERROR(
         sycl::free(g_scratch_buffer, dpct::get_in_order_queue())));
     g_scratch_buffer = nullptr;
 }
@@ -11413,7 +11381,7 @@ catch (sycl::exception const &exc) {
 void ggml_cuda_get_device_description(int device, char *description,
                                       size_t description_size) try {
     dpct::device_info prop;
-    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+    SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
         prop, dpct::dev_mgr::instance().get_device(device))));
     snprintf(description, description_size, "%s", prop.get_name());
 }
@@ -11460,7 +11428,7 @@ struct ggml_backend_buffer_context_cuda {
 static void
 ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
     delete ctx;
 }
@@ -11503,7 +11471,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer,
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset(
+            SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset(
                 (char *)tensor->data + original_size, 0,
                 padded_size - original_size)));
         }
@@ -11526,10 +11494,10 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer,
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(dpct::get_in_order_queue()
                              .memcpy((char *)tensor->data + offset, data, size)
                              .wait()));
@@ -11549,10 +11517,10 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer,
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
-    CUDA_CHECK(DPCT_CHECK_ERROR(
+    SYCL_CHECK(DPCT_CHECK_ERROR(
         dpct::get_in_order_queue()
             .memcpy(data, (const char *)tensor->data + offset, size)
             .wait()));
@@ -11568,10 +11536,10 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer,
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
 
     ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(
+    SYCL_CHECK(
         DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
-    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+    SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
                                     .memset(ctx->dev_ptr, value, buffer->size)
                                     .wait()));
 }
@@ -11604,7 +11572,7 @@ ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
     void * dev_ptr;
-    CUDA_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(
+    SYCL_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(
                                     size, dpct::get_in_order_queue())));
 
     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
@@ -11746,7 +11714,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+    SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
         (char *)tensor->data + offset, data, size)));
 }
 catch (sycl::exception const &exc) {
@@ -11764,7 +11732,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+    SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
         data, (const char *)tensor->data + offset, size)));
 }
 catch (sycl::exception const &exc) {
@@ -11776,7 +11744,7 @@ catch (sycl::exception const &exc) {
 static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try {
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
-    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
+    SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
 
     UNUSED(backend);
 }
diff --git a/run.sh b/run.sh
index 15714240e2a3e..39079dd20f9b7 100755
--- a/run.sh
+++ b/run.sh
@@ -5,9 +5,14 @@ INPUT1="The process of Origami seems simple at the first glance, but in fact, it
 INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh
 
-export GGML_SYCL_DEVIC=0
-export GGML_SYCL_DEBUG=1
-export GGML_SYCL_LIST_DEVICE=1
+if [ $# -gt 0 ]; then
+  export GGML_SYCL_DEVICE=$1
+else
+  export GGML_SYCL_DEVICE=0
+fi
+echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
+#export GGML_SYCL_DEBUG=1
+#export GGML_SYCL_LIST_DEVICE=1
 #./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
 

From 3a9d2c54ba9f7a172da2eb5c00c9bd62a243a1d3 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Thu, 4 Jan 2024 14:26:36 +0800
Subject: [PATCH 09/90] step6, enhance error check, remove CUDA macro, enhance
 device id to fix none-zero id issue

---
 CMakeLists.txt |   9 +-
 ggml-sycl.cpp  | 420 ++++++++++++++++++++++++++++---------------------
 ggml-sycl.hpp  |  15 +-
 run.sh         |   2 +-
 setup.sh       |   2 +-
 5 files changed, 258 insertions(+), 190 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a2895692d6556..8488689d91112 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -448,11 +448,11 @@ endif()
 
 if (LLAMA_SYCL)
     set(ENABLE_AOT ats)
-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for SYCL")
+    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Intel")
+	    message(WARNING "${CMAKE_C_COMPILER_ID} Need IntelLLVM for SYCL")
     endif()
-    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for SYCL")
+    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Intel")
+        message(WARNING "${CMAKE_CXX_COMPILER_ID} Need IntelLLVM for SYCL")
     endif()
 
     #find_package(SYCL REQUIRED)
@@ -476,6 +476,7 @@ if (LLAMA_SYCL)
 	    add_compile_definitions(GGML_USE_CUBLAS)
 	    #add_compile_definitions(GGML_SYCL_F16)
 	    #add_compile_options(-std=c++17 -O3 -fsycl)
+	add_compile_options(-I./)
         add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include)
         add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include/sycl)
         add_compile_options(-I/opt/intel/oneapi/dpcpp-ct/2024.0/include)
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index df26b7f576c2a..dab9726a70b81 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -17,7 +17,7 @@
 #include <dpct/dpct.hpp>
 #include <dpct/blas_utils.hpp>
 #include <dpct/lib_common_utils.hpp>
-
+#include "ggml-sycl.hpp"
 #include "ggml-cuda.h"
 #include "ggml.h"
 #include "ggml-backend-impl.h"
@@ -366,7 +366,7 @@ struct ggml_tensor_extra_gpu {
 // probably because the Windows CUDA libraries forget to make this check before invoking the drivers
 inline dpct::err0 ggml_cuda_set_device(const int device) try {
     int current_device;
-    SYCL_CHECK(DPCT_CHECK_ERROR(
+    SYCL_CHECK(CHECK_TRY_ERROR(
         current_device = dpct::dev_mgr::instance().current_device_id()));
 
     if (device == current_device) {
@@ -377,7 +377,7 @@ inline dpct::err0 ggml_cuda_set_device(const int device) try {
     DPCT1093:53: The "device" device may be not the one intended for use. Adjust
     the selected device if needed.
     */
-    return DPCT_CHECK_ERROR(dpct::select_device(device));
+    return CHECK_TRY_ERROR(dpct::select_device(device));
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -386,17 +386,26 @@ catch (sycl::exception const &exc) {
 }
 
 static int g_device_count = -1;
+static int g_all_sycl_device_count = -1;
 static int g_main_device = 0;
+static int g_main_device_index = 0;
+
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
 struct cuda_device_capabilities {
     int     cc;                 // compute capability
     bool    vmm;                // virtual memory support
     size_t  vmm_granularity;    // granularity of virtual memory
+    int device_id;
 };
 
-static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
+static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0, -1} };
+
+struct sycl_device_id2index {
+    int index;
+};
 
+static sycl_device_id2index g_sycl_device_id2index[GGML_CUDA_MAX_DEVICES] = { {-1} };
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
@@ -6378,6 +6387,14 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
         });
 }
 
+int get_device_index_by_id(int id){
+    return g_sycl_device_id2index[id].index;
+}
+
+int get_current_device_index(){
+    return get_device_index_by_id(dpct::dev_mgr::instance().current_device_id());
+}
+
 static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
@@ -6386,7 +6403,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
 
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -6501,7 +6518,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
 
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -6616,7 +6633,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
 
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -6731,7 +6748,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
 
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -6846,7 +6863,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
 
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -6961,7 +6978,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
 
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7084,7 +7101,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
 
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7212,7 +7229,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
 
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7333,7 +7350,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
 
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -7454,7 +7471,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
 
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
@@ -8039,7 +8056,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
 #ifdef DEBUG_CUDA_MALLOC
     int nnz = 0;
     size_t max_size = 0;
@@ -8081,14 +8098,16 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
     size_t look_ahead_size = (size_t) (1.05 * size);
     look_ahead_size = 256 * ((look_ahead_size + 255)/256);
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(
+        CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
                              look_ahead_size, dpct::get_in_order_queue())));
     *actual_size = look_ahead_size;
     g_cuda_pool_size[id] += look_ahead_size;
+
 #ifdef DEBUG_CUDA_MALLOC
     fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
 #endif
+    GGML_SYCL_DEBUG("zjy ggml_cuda_pool_malloc_leg return %p\n", ptr);
     return ptr;
 }
 catch (sycl::exception const &exc) {
@@ -8101,7 +8120,7 @@ static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
 
     for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
         cuda_buffer& b = g_cuda_buffer_pool[id][i];
@@ -8112,7 +8131,7 @@ static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
         }
     }
     fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
-    SYCL_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
     g_cuda_pool_size[id] -= size;
 }
 catch (sycl::exception const &exc) {
@@ -8145,7 +8164,7 @@ static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try {
     scoped_spin_lock lock(g_cuda_pool_lock);
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 
 #ifdef DEBUG_CUDA_MALLOC
     printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
@@ -8165,7 +8184,7 @@ catch (sycl::exception const &exc) {
 static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try {
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     if (g_device_caps[id].vmm) {
         return ggml_cuda_pool_malloc_vmm(size, actual_size);
     } else {
@@ -8181,7 +8200,7 @@ catch (sycl::exception const &exc) {
 static void ggml_cuda_pool_free(void *ptr, size_t size) try {
     int id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
     if (g_device_caps[id].vmm) {
         ggml_cuda_pool_free_vmm(ptr, size);
     } else {
@@ -8204,6 +8223,7 @@ struct cuda_pool_alloc {
     T * alloc(size_t size) {
         GGML_ASSERT(ptr == nullptr);
         ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->actual_size);
+        GGML_SYCL_DEBUG("zjy alloc %lu return %p actual size=%lu\n", size * sizeof(T), ptr, this->actual_size);
         return ptr;
     }
 
@@ -8238,11 +8258,16 @@ void print_devices(){
     fprintf(stderr, "%s: found %d SYCL devices:\n", __func__, device_count);
     for (int id = 0; id < device_count; ++id) {
         dpct::device_info prop;
-        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
             prop, dpct::dev_mgr::instance().get_device(id))));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id,
+        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, max compute_units %d, max work group size %d, max sub group size %d, global mem size %lu\n", id,
                 prop.get_name(), prop.get_major_version(),
-                prop.get_minor_version());
+                prop.get_minor_version(),
+                prop.get_max_compute_units(),
+                prop.get_max_work_group_size(),
+                prop.get_max_sub_group_size(),
+                prop.get_global_mem_size()
+                );
     }
 }
 
@@ -8261,6 +8286,7 @@ int get_sycl_env(const char* env_name, int default_val){
 
 void ggml_init_cublas() try {
     static bool initialized = false;
+
     if (!initialized) {
         if (get_sycl_env("GGML_SYCL_LIST_DEVICE", 0)!=0){
             printf("SYCL devices:\n");
@@ -8277,7 +8303,7 @@ void ggml_init_cublas() try {
 
         print_devices();
 
-        if (DPCT_CHECK_ERROR(g_device_count =
+        if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                                  dpct::dev_mgr::instance().device_count()) !=
             0) {
             initialized = true;
@@ -8285,7 +8311,7 @@ void ggml_init_cublas() try {
             return;
         }
 
-        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
+        GGML_ASSERT(g_all_sycl_device_count <= GGML_CUDA_MAX_DEVICES);
         int64_t total_vram = 0;
 
 #if defined(GGML_SYCL_FP16)
@@ -8301,16 +8327,20 @@ void ggml_init_cublas() try {
         fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
 
-        //zjy hardcode, force set to 1 device
-        g_device_count = 1;
-        for (int id = 0; id < g_device_count; ++id) {
+
+        int device_inx = -1;
+        for (int id = 0; id < g_all_sycl_device_count; ++id) {
+            if(id!=user_device_number) continue;
+            device_inx++;
             int device_vmm = 0;
 
-            g_device_caps[id].vmm = !!device_vmm;
+            g_device_caps[device_inx].vmm = !!device_vmm;
+            g_device_caps[device_inx].device_id = id;
+            g_sycl_device_id2index[id].index = device_inx;
 
             dpct::device_info prop;
-            SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
-                prop, dpct::dev_mgr::instance().get_device(user_device_number))));
+            SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
+                prop, dpct::dev_mgr::instance().get_device(id))));
             /*
             DPCT1005:86: The SYCL device version is different from CUDA Compute
             Compatibility. You may need to rewrite this code.
@@ -8320,24 +8350,29 @@ void ggml_init_cublas() try {
                     prop.get_name(), prop.get_major_version(),
                     prop.get_minor_version(), device_vmm ? "yes" : "no");
 
-            g_tensor_split[id] = total_vram;
+            g_tensor_split[device_inx] = total_vram;
             total_vram += prop.get_global_mem_size();
             /*
             DPCT1005:87: The SYCL device version is different from CUDA Compute
             Compatibility. You may need to rewrite this code.
             */
-            g_device_caps[id].cc =
+            g_device_caps[device_inx].cc =
                 100 * prop.get_major_version() + 10 * prop.get_minor_version();
-
-            // g_device_caps[id].cc = 9000;
-            printf("g_device_caps[%d].cc=%d\n", id, g_device_caps[id].cc);
+              // g_device_caps[id].cc = 9000;
+            printf("g_device_caps[%d].cc=%d\n", device_inx, g_device_caps[device_inx].cc);
         }
-        for (int id = 0; id < g_device_count; ++id) {
-            g_tensor_split[id] /= total_vram;
+        device_inx = -1;
+        for (int id = 0; id < g_all_sycl_device_count; ++id) {
+            if(id!=user_device_number) continue;
+            device_inx++;
+            g_tensor_split[device_inx] /= total_vram;
         }
 
-        for (int id = 0; id < g_device_count; ++id) {
-            SYCL_CHECK(ggml_cuda_set_device(user_device_number));
+        device_inx = -1;
+        for (int id = 0; id < g_all_sycl_device_count; ++id) {
+            if(id!=user_device_number) continue;
+            device_inx++;
+            SYCL_CHECK(ggml_cuda_set_device(id));
 
             // create cuda streams
             for (int is = 0; is < MAX_STREAMS; ++is) {
@@ -8345,13 +8380,13 @@ void ggml_init_cublas() try {
                 DPCT1025:88: The SYCL queue is created ignoring the flag and
                 priority options.
                 */
-                SYCL_CHECK(DPCT_CHECK_ERROR(
-                    g_cudaStreams[id][is] =
+                SYCL_CHECK(CHECK_TRY_ERROR(
+                    g_cudaStreams[device_inx][is] =
                         dpct::get_current_device().create_queue()));
             }
 
             // create cublas handle
-            SYCL_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
+            SYCL_CHECK(CHECK_TRY_ERROR(g_cublas_handles[device_inx] =
                                               &dpct::get_in_order_queue()));
             /*
             DPCT1027:89: The call to cublasSetMathMode was replaced with 0
@@ -8362,6 +8397,9 @@ void ggml_init_cublas() try {
 
         // configure logging to stdout
         // SYCL_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+
+        //zjy hardcode, force set to 1 device
+        g_device_count = 1;
         ggml_cuda_set_device(user_device_number);
         fprintf(stderr, "Using Device %d\n", user_device_number);
         initialized = true;
@@ -8405,7 +8443,7 @@ void *ggml_cuda_host_malloc(size_t size) try {
     }
 
     void * ptr = nullptr;
-    dpct::err0 err = DPCT_CHECK_ERROR(
+    dpct::err0 err = CHECK_TRY_ERROR(
         ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
     /*
     DPCT1000:82: Error handling if-stmt was detected but could not be rewritten.
@@ -8441,7 +8479,7 @@ catch (sycl::exception const &exc) {
 }
 
 void ggml_cuda_host_free(void *ptr) try {
-    SYCL_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -8460,15 +8498,18 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
     if (src->backend == GGML_BACKEND_CPU) {
         kind = dpct::host_to_device;
         src_ptr = (char *) src->data;
+        GGML_SYCL_DEBUG("zjy ggml_cuda_cpy_tensor_2d  GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = dpct::device_to_device;
         ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
         int id;
-        SYCL_CHECK(DPCT_CHECK_ERROR(
-            id = dpct::dev_mgr::instance().current_device_id()));
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            id = get_current_device_index()));
+        GGML_SYCL_DEBUG("zjy current device index %d\n", id);
         src_ptr = (char *) extra->data_device[id];
     } else {
+        GGML_SYCL_DEBUG("zjy GGML_ASSERT(false)\n");
         GGML_ASSERT(false);
     }
     char * dst_ptr = (char *) dst;
@@ -8485,9 +8526,13 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
 
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
-        return DPCT_CHECK_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1));
+        GGML_SYCL_DEBUG("zjy stream->memcpy: dst_ptr=%p, x=%p, size=%lu\n", dst_ptr, x, i1_diff * nb1);
+        // return CHECK_TRY_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1));
+        return CHECK_TRY_ERROR(dpct::async_dpct_memcpy(dst_ptr, x, i1_diff * nb1,
+                                    kind, *stream));
+
     } else if (nb0 == ts) {
-        return DPCT_CHECK_ERROR(
+        return CHECK_TRY_ERROR(
             dpct::async_dpct_memcpy(dst_ptr, ts * ne0 / bs, x, nb1,
                                     ts * ne0 / bs, i1_diff, kind, *stream));
     } else {
@@ -8495,7 +8540,7 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
             const void * rx = (const void *) ((const char *) x + i1*nb1);
             void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
             // pretend the row is a matrix with cols=1
-            dpct::err0 r = DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
+            dpct::err0 r = CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
                 rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream));
             /*
             DPCT1001:85: The statement could not be removed.
@@ -8882,13 +8927,13 @@ inline void ggml_cuda_op_mul_mat_q(
 
     const int64_t row_diff = row_high - row_low;
 
-    int id;
+    int device_id;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(device_id = dpct::dev_mgr::instance().current_device_id()));
 
     // the main device has a larger memory buffer to hold the results from all GPUs
     // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
-    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
@@ -9123,12 +9168,13 @@ inline void ggml_cuda_op_mul_mat_cublas(
     const int64_t row_diff = row_high - row_low;
 
     int id;
+    int device_id = dpct::dev_mgr::instance().current_device_id();
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
+        CHECK_TRY_ERROR(id = get_current_device_index()));
 
     // the main device has a larger memory buffer to hold the results from all GPUs
     // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
+    int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
 
     const int compute_capability = g_device_caps[id].cc;
 
@@ -9165,8 +9211,8 @@ inline void ggml_cuda_op_mul_mat_cublas(
         const sycl::half alpha_f16 = 1.0f;
         const sycl::half beta_f16 = 0.0f;
 
-        SYCL_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
-        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::gemm(
+        SYCL_CHECK(CHECK_TRY_ERROR(g_cublas_handles[id] = stream));
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
             *g_cublas_handles[id], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
             &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
@@ -9192,8 +9238,8 @@ inline void ggml_cuda_op_mul_mat_cublas(
         const float alpha = 1.0f;
         const float beta = 0.0f;
 
-        SYCL_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
-        SYCL_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(
+        SYCL_CHECK(CHECK_TRY_ERROR(g_cublas_handles[id] = stream));
+        SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
             *g_cublas_handles[id], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
             dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00,
@@ -9523,25 +9569,29 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
     cuda_pool_alloc<float>  dst_f;
 
     ggml_cuda_set_device(g_main_device);
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device_index][0];
+    GGML_SYCL_DEBUG("zjy g_main_device_index=%d, src0=%p\n", g_main_device_index, src0);
 
     if (src0_on_device) {
-        src0_ddf = (float *) src0_extra->data_device[g_main_device];
+        src0_ddf = (float *) src0_extra->data_device[g_main_device_index];
     } else {
         src0_ddf = src0_f.alloc(ggml_nelements(src0));
+        GGML_SYCL_DEBUG("zjy g_main_device_index=%d, src0_ddf=%p\n", g_main_device_index, src0_ddf);
+
+        GGML_SYCL_DEBUG("zjy before ggml_cuda_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0);
         SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
     if (use_src1) {
         if (src1_on_device) {
-            src1_ddf = (float *) src1_extra->data_device[g_main_device];
+            src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
         } else {
             src1_ddf = src1_f.alloc(ggml_nelements(src1));
             SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
         }
     }
     if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device];
+        dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
     } else {
         dst_ddf = dst_f.alloc(ggml_nelements(dst));
     }
@@ -9556,12 +9606,12 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
 
     // copy dst to host if necessary
     if (!dst_on_device) {
-        SYCL_CHECK(DPCT_CHECK_ERROR(
+        SYCL_CHECK(CHECK_TRY_ERROR(
             main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
     }
 
     if (dst->backend == GGML_BACKEND_CPU) {
-        SYCL_CHECK(DPCT_CHECK_ERROR(
+        SYCL_CHECK(CHECK_TRY_ERROR(
             dpct::get_current_device().queues_wait_and_throw()));
     }
 }
@@ -9582,18 +9632,20 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
 
 #ifdef NDEBUG
     for (int id = 0; id < g_device_count; ++id) {
-        SYCL_CHECK(ggml_cuda_set_device(id));
+        SYCL_CHECK(ggml_cuda_set_device(g_device_caps[id].device_id));
         // SYCL_CHECK(cudaDeviceSynchronize());
     }
 
     for (int id = 0; id < g_device_count; ++id) {
-        SYCL_CHECK(ggml_cuda_set_device(id));
+        SYCL_CHECK(ggml_cuda_set_device(g_device_caps[id].device_id));
+        int device_id = g_device_caps[id].device_id;
 
         for (int id_other = 0; id_other < g_device_count; ++id_other) {
-            if (id == id_other) {
+            int device_id_other = g_device_caps[id_other].device_id;
+            if (device_id == id_other) {
                 continue;
             }
-            if (id != g_main_device && id_other != g_main_device) {
+            if (device_id != g_main_device && device_id_other != g_main_device) {
                 continue;
             }
 
@@ -9707,16 +9759,16 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
             }
         }
     }
-
     for (int64_t id = 0; id < g_device_count; ++id) {
-        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+
+        if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) {
             continue;
         }
 
         used_devices++;
 
-        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
-        const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
+        const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
 
         ggml_cuda_set_device(id);
         const dpct::queue_ptr stream = g_cudaStreams[id][0];
@@ -9765,9 +9817,9 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
         consumed by the program logic. This original code was replaced with 0.
         You may need to rewrite the program logic consuming the error code.
         */
-        SYCL_CHECK(DPCT_CHECK_ERROR(
-            *src0_extra->events[g_main_device][0] =
-                g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier()));
+        SYCL_CHECK(CHECK_TRY_ERROR(
+            *src0_extra->events[g_main_device_index][0] =
+                g_cudaStreams[g_main_device_index][0]->ext_oneapi_submit_barrier()));
     }
 
     const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
@@ -9776,21 +9828,21 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
         const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
 
         for (int64_t id = 0; id < g_device_count; ++id) {
-            if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+            if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) {
                 continue;
             }
 
-            const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
-            const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
+            const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
+            const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
             const int64_t row_diff = row_high[id] - row_low[id];
 
             ggml_cuda_set_device(id);
             const dpct::queue_ptr stream = g_cudaStreams[id][is];
 
             // wait for main GPU data if necessary
-            if (split && (id != g_main_device || is != 0)) {
-                SYCL_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier(
-                    {*src0_extra->events[g_main_device][0]})));
+            if (split && (id != g_main_device_index || is != 0)) {
+                SYCL_CHECK(CHECK_TRY_ERROR(stream->ext_oneapi_submit_barrier(
+                    {*src0_extra->events[g_main_device_index][0]})));
             }
 
             for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
@@ -9807,23 +9859,23 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
 
                 // the main device memory buffer can be on VRAM scratch, with space for all partial results
                 // in that case an offset on dst_ddf_i is needed
-                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
+                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device_index) {
                     dst_dd_i += row_low[id]; // offset is 0 if no tensor split
                 }
 
                 // copy src0, src1 to device if necessary
                 if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
-                    if (id != g_main_device) {
+                    if (id != g_main_device_index) {
                         if (convert_src1_to_q8_1) {
-                            char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
-                            SYCL_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                            char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset;
+                            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
                                 src1_ddq_i, src1_ddq_i_source,
                                 src1_ncols * src1_padded_col_size * q8_1_ts /
                                     q8_1_bs)));
                         } else {
-                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
+                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device_index];
                             src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
-                            SYCL_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                            SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
                                 src1_ddf_i, src1_ddf_i_source,
                                 src1_ncols * ne10 * sizeof(float))));
                         }
@@ -9867,7 +9919,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                         dst_off_device = dst->data;
                         kind = dpct::device_to_host;
                     } else if (dst->backend == GGML_BACKEND_GPU) {
-                        dst_off_device = dst_extra->data_device[g_main_device];
+                        dst_off_device = dst_extra->data_device[g_main_device_index];
                         kind = dpct::device_to_device;
                     } else {
                         GGML_ASSERT(false);
@@ -9881,7 +9933,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                         float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                         dhf_dst_i += src1_col_0*ne0 + row_low[id];
-                        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
+                        SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy(
                             dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
                             row_diff * sizeof(float), row_diff * sizeof(float),
                             src1_ncols, kind, *stream)));
@@ -9889,21 +9941,21 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                         float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
                         GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
                         dhf_dst_i += src1_col_0*ne0;
-                        SYCL_CHECK(DPCT_CHECK_ERROR(
+                        SYCL_CHECK(CHECK_TRY_ERROR(
                             stream->memcpy(dhf_dst_i, dst_dd_i,
                                            src1_ncols * ne0 * sizeof(float))));
                     }
                 }
 
                 // add event for the main device to wait on until other device is done
-                if (split && (id != g_main_device || is != 0)) {
+                if (split && (id != g_main_device_index || is != 0)) {
                     /*
                     DPCT1024:94: The original code returned the error code that
                     was further consumed by the program logic. This original
                     code was replaced with 0. You may need to rewrite the
                     program logic consuming the error code.
                     */
-                    SYCL_CHECK(DPCT_CHECK_ERROR(
+                    SYCL_CHECK(CHECK_TRY_ERROR(
                         *src0_extra->events[id][is] =
                             stream->ext_oneapi_submit_barrier()));
                 }
@@ -9912,7 +9964,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
     }
 
     for (int64_t id = 0; id < g_device_count; ++id) {
-        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
+        if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) {
             continue;
         }
         SYCL_CHECK(ggml_cuda_set_device(id));
@@ -9943,8 +9995,8 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                 continue;
             }
             for (int64_t is = 0; is < is_max; ++is) {
-                SYCL_CHECK(DPCT_CHECK_ERROR(
-                    g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier(
+                SYCL_CHECK(CHECK_TRY_ERROR(
+                    g_cudaStreams[g_main_device_index][0]->ext_oneapi_submit_barrier(
                         {*src0_extra->events[id][is]})));
             }
         }
@@ -9952,7 +10004,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
 
     if (dst->backend == GGML_BACKEND_CPU) {
         SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-        SYCL_CHECK(DPCT_CHECK_ERROR(
+        SYCL_CHECK(CHECK_TRY_ERROR(
             dpct::get_current_device().queues_wait_and_throw()));
     }
 }
@@ -10070,16 +10122,16 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0,
     const int64_t ne12 = src1->ne[2];
 
     SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device_index][0];
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
+    void * src0_ddq = src0_extra->data_device[g_main_device_index];
 
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
 
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
 
     ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
@@ -10109,16 +10161,16 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0,
     const int64_t ne12 = src1->ne[2];
 
     SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device_index][0];
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
+    void * src0_ddq = src0_extra->data_device[g_main_device_index];
 
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
 
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
 
     const int64_t row_stride_x = nb01 / sizeof(sycl::half);
     const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
@@ -10188,20 +10240,20 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
     const int64_t ne  = ggml_nelements(dst);
 
     SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device_index][0];
 
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream));
+        CHECK_TRY_ERROR(g_cublas_handles[g_main_device_index] = main_stream));
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
+    void * src0_ddq = src0_extra->data_device[g_main_device_index];
     sycl::half *src0_as_f16 = (sycl::half *)src0_ddq;
 
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
 
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
 
     // convert src1 to fp16
     const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
@@ -10260,7 +10312,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
                 int i02 = i12 / r2;
 
                 SYCL_CHECK(
-                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+                        cublasGemmEx(g_cublas_handles[g_main_device_index], CUBLAS_OP_T, CUBLAS_OP_N,
                             ne01, ne11, ne10,
                             alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
                                    (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
@@ -10274,8 +10326,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
     if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
         // use cublasGemmStridedBatchedEx
-        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
-            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
+            *g_cublas_handles[g_main_device_index], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const char *)src0_as_f16, dpct::library_data_t::real_half,
             nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
@@ -10323,8 +10375,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
         */
         SYCL_CHECK(0);
 
-        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
-            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
+            *g_cublas_handles[g_main_device_index], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const void **)(ptrs_src.get() + 0 * ne23),
             dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),
@@ -10503,19 +10555,19 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
     const int64_t ne  = ggml_nelements(dst);
 
     SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
+    cudaStream_t main_stream = g_cudaStreams[g_main_device_index][0];
 
-    SYCL_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
+    SYCL_CHECK(cublasSetStream(g_cublas_handles[g_main_device_index], main_stream));
 
     //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    //void * src0_ddq = src0_extra->data_device[g_main_device];
+    //void * src0_ddq = src0_extra->data_device[g_main_device_index];
     //half * src0_as_f16 = (half *) src0_ddq;
 
     ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
+    float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
 
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
+    float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
 
     // convert src1 to fp16
     const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
@@ -10569,16 +10621,16 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
             r2, r3,
             src00->type, src0_as_f16, src0_ne,
             src1_as_f16, dst_f16,
-            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
-            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
+            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index], id,
+            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device_index] : nullptr,
+            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device_index] : nullptr,
+            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device_index] : nullptr,
+            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device_index] : nullptr
     );
     SYCL_CHECK(cudaGetLastError());
 
     SYCL_CHECK(
-    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
+    cublasGemmBatchedEx(g_cublas_handles[g_main_device_index], CUBLAS_OP_T, CUBLAS_OP_N,
             ne01, ne11, ne10,
             &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
                         (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
@@ -10622,13 +10674,13 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
 
     std::vector<char> ids_host(ggml_nbytes(ids));
 
-    const dpct::queue_ptr stream = g_cudaStreams[g_main_device][0];
+    const dpct::queue_ptr stream = g_cudaStreams[g_main_device_index][0];
 
     if (ids->backend == GGML_BACKEND_GPU) {
-        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        SYCL_CHECK(DPCT_CHECK_ERROR(
+        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
+        SYCL_CHECK(CHECK_TRY_ERROR(
             stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
-        SYCL_CHECK(DPCT_CHECK_ERROR(stream->wait()));
+        SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
     } else {
         memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
     }
@@ -10649,9 +10701,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
     dst_row.extra = &dst_row_extra;
 
     char * src1_original = src1->backend == GGML_BACKEND_CPU ?
-        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
+        (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index];
     char * dst_original  =  dst->backend == GGML_BACKEND_CPU ?
-        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];
+        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device_index];
 
     if (src1->ne[1] == 1) {
         GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
@@ -10668,10 +10720,10 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
 
             const struct ggml_tensor * src0_row = dst->src[row_id + 2];
 
-            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
+            src1_row_extra.data_device[g_main_device_index] = src1_original + i01*src1->nb[1];
             src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
 
-            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
+            dst_row_extra.data_device[g_main_device_index] = dst_original + i01*dst->nb[1];
             dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
 
             ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
@@ -10680,8 +10732,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
         cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
         cuda_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));
 
-        src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
-        dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();
+        src1_row_extra.data_device[g_main_device_index] = src1_contiguous.get();
+        dst_row_extra.data_device[g_main_device_index]  =  dst_contiguous.get();
 
         const dpct::memcpy_direction src1_kind =
             src1->backend == GGML_BACKEND_CPU ? dpct::host_to_device
@@ -10703,7 +10755,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
 
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-                SYCL_CHECK(DPCT_CHECK_ERROR(
+                SYCL_CHECK(CHECK_TRY_ERROR(
                     stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
                                    src1_original + i01 * nb11, nb11)));
                 num_src1_rows++;
@@ -10736,7 +10788,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
 
                 GGML_ASSERT(row_id >= 0 && row_id < n_as);
 
-                SYCL_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
+                SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(
                     dst_original + i01 * nb1,
                     dst_contiguous.get() + num_src1_rows * nb1, nb1)));
                 num_src1_rows++;
@@ -10745,7 +10797,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
     }
 
     if (dst->backend == GGML_BACKEND_CPU) {
-        SYCL_CHECK(DPCT_CHECK_ERROR(stream->wait()));
+        SYCL_CHECK(CHECK_TRY_ERROR(stream->wait()));
     }
 }
 catch (sycl::exception const &exc) {
@@ -10790,13 +10842,13 @@ static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
     const int64_t nb12 = src1->nb[2];
 
     SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
+    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device_index][0];
 
     const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
 
-    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
+    char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
+    char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index];
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
@@ -10885,7 +10937,7 @@ void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
     memset(extra, 0, sizeof(*extra));
 
     for (int64_t id = 0; id < g_device_count; ++id) {
-        if (backend == GGML_BACKEND_GPU && id != g_main_device) {
+        if (backend == GGML_BACKEND_GPU && id != g_main_device_index) {
             continue;
         }
 
@@ -10926,19 +10978,19 @@ void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
         }
 
         char * buf;
-        SYCL_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(
+        SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
                                         size, dpct::get_in_order_queue())));
         char * buf_host = (char *)data + offset_split;
 
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
-            SYCL_CHECK(DPCT_CHECK_ERROR(
+            SYCL_CHECK(CHECK_TRY_ERROR(
                 dpct::get_in_order_queue()
                     .memset(buf + original_size, 0, size - original_size)
                     .wait()));
         }
 
-        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_in_order_queue()
                                         .memcpy(buf, buf_host, original_size)
                                         .wait()));
 
@@ -10946,7 +10998,7 @@ void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
 
         if (backend == GGML_BACKEND_GPU_SPLIT) {
             for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-                SYCL_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] =
+                SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] =
                                                 new sycl::event()));
             }
         }
@@ -10970,14 +11022,14 @@ void ggml_cuda_free_data(struct ggml_tensor *tensor) try {
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
             SYCL_CHECK(ggml_cuda_set_device(id));
-            SYCL_CHECK(DPCT_CHECK_ERROR(sycl::free(
+            SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
                 extra->data_device[id], dpct::get_in_order_queue())));
         }
 
         for (int64_t is = 0; is < MAX_STREAMS; ++is) {
             if (extra->events[id][is] != nullptr) {
                 SYCL_CHECK(ggml_cuda_set_device(id));
-                SYCL_CHECK(DPCT_CHECK_ERROR(
+                SYCL_CHECK(CHECK_TRY_ERROR(
                     dpct::destroy_event(extra->events[id][is])));
             }
         }
@@ -11041,18 +11093,18 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
     SYCL_CHECK(ggml_cuda_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
-        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
         size_t offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
             memcpy(&offset, tensor->op_params, sizeof(size_t));
         }
         extra = ggml_cuda_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device] = src0_ddc + offset;
+        extra->data_device[g_main_device_index] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
         ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
-        void * src1_ddv = src1_extra->data_device[g_main_device];
+        void * src1_ddv = src1_extra->data_device[g_main_device_index];
         extra = ggml_cuda_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device] = src1_ddv;
+        extra->data_device[g_main_device_index] = src1_ddv;
     } else if (scratch) {
         GGML_ASSERT(size <= g_scratch_size);
         if (g_scratch_offset + size > g_scratch_size) {
@@ -11061,26 +11113,26 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
 
         char * data = (char *) g_scratch_buffer;
         if (data == nullptr) {
-            SYCL_CHECK(DPCT_CHECK_ERROR(
+            SYCL_CHECK(CHECK_TRY_ERROR(
                 data = (char *)sycl::malloc_device(
                     g_scratch_size, dpct::get_in_order_queue())));
             g_scratch_buffer = data;
         }
         extra = ggml_cuda_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device] = data + g_scratch_offset;
+        extra->data_device[g_main_device_index] = data + g_scratch_offset;
 
         g_scratch_offset += size;
 
         GGML_ASSERT(g_scratch_offset <= g_scratch_size);
     } else { // allocate new buffers outside of scratch
         void * data;
-        SYCL_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(
+        SYCL_CHECK(CHECK_TRY_ERROR(data = (void *)sycl::malloc_device(
                                         size, dpct::get_in_order_queue())));
-        SYCL_CHECK(DPCT_CHECK_ERROR(
+        SYCL_CHECK(CHECK_TRY_ERROR(
             dpct::get_in_order_queue().memset(data, 0, size).wait()));
         extra = new ggml_tensor_extra_gpu;
         memset(extra, 0, sizeof(*extra));
-        extra->data_device[g_main_device] = data;
+        extra->data_device[g_main_device_index] = data;
     }
 
     tensor->extra = extra;
@@ -11099,7 +11151,7 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor,
     if (g_scratch_buffer == nullptr) {
         ggml_cuda_set_device(g_main_device);
         SYCL_CHECK(
-            DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
+            CHECK_TRY_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
                                  g_scratch_size, dpct::get_in_order_queue())));
     }
 
@@ -11109,14 +11161,14 @@ void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor,
 
     if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
         ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
-        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
+        char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
         size_t view_offset = 0;
         if (tensor->op == GGML_OP_VIEW) {
             memcpy(&view_offset, tensor->op_params, sizeof(size_t));
         }
-        extra->data_device[g_main_device] = src0_ddc + view_offset;
+        extra->data_device[g_main_device_index] = src0_ddc + view_offset;
     } else {
-        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
+        extra->data_device[g_main_device_index] = (char *) g_scratch_buffer + offset;
     }
 
     tensor->extra = extra;
@@ -11133,8 +11185,8 @@ void ggml_cuda_copy_to_device(struct ggml_tensor *tensor) try {
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-    SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
-                                    .memcpy(extra->data_device[g_main_device],
+    SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_in_order_queue()
+                                    .memcpy(extra->data_device[g_main_device_index],
                                             tensor->data, ggml_nbytes(tensor))
                                     .wait()));
 }
@@ -11161,16 +11213,18 @@ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
 }
 
 void ggml_cuda_set_main_device(const int main_device) try {
-    if (main_device >= g_device_count) {
+
+    if (main_device >= g_all_sycl_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
-                main_device, g_device_count, g_main_device);
+                main_device, g_all_sycl_device_count, g_main_device);
         return;
     }
 
     if (g_main_device != main_device && g_device_count > 1) {
         g_main_device = main_device;
+        g_main_device_index = get_device_index_by_id(g_main_device);
         dpct::device_info prop;
-        SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
             prop, dpct::dev_mgr::instance().get_device(g_main_device))));
         fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__,
                 g_main_device, prop.get_name());
@@ -11196,7 +11250,7 @@ void ggml_cuda_free_scratch() try {
         return;
     }
 
-    SYCL_CHECK(DPCT_CHECK_ERROR(
+    SYCL_CHECK(CHECK_TRY_ERROR(
         sycl::free(g_scratch_buffer, dpct::get_in_order_queue())));
     g_scratch_buffer = nullptr;
 }
@@ -11366,7 +11420,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 
 int ggml_cuda_get_device_count() try {
     int device_count;
-    if (DPCT_CHECK_ERROR(device_count =
+    if (CHECK_TRY_ERROR(device_count =
                              dpct::dev_mgr::instance().device_count()) != 0) {
         return 0;
     }
@@ -11381,7 +11435,7 @@ catch (sycl::exception const &exc) {
 void ggml_cuda_get_device_description(int device, char *description,
                                       size_t description_size) try {
     dpct::device_info prop;
-    SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
+    SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
         prop, dpct::dev_mgr::instance().get_device(device))));
     snprintf(description, description_size, "%s", prop.get_name());
 }
@@ -11429,7 +11483,7 @@ static void
 ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
     ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
+        CHECK_TRY_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
     delete ctx;
 }
 catch (sycl::exception const &exc) {
@@ -11471,7 +11525,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer,
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset(
+            SYCL_CHECK(CHECK_TRY_ERROR(g_cudaStreams[ctx->device][0]->memset(
                 (char *)tensor->data + original_size, 0,
                 padded_size - original_size)));
         }
@@ -11495,10 +11549,10 @@ static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer,
 
     ggml_cuda_set_device(ctx->device);
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
+        CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+        CHECK_TRY_ERROR(dpct::get_in_order_queue()
                              .memcpy((char *)tensor->data + offset, data, size)
                              .wait()));
 }
@@ -11518,9 +11572,9 @@ static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer,
 
     ggml_cuda_set_device(ctx->device);
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
+        CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
-    SYCL_CHECK(DPCT_CHECK_ERROR(
+    SYCL_CHECK(CHECK_TRY_ERROR(
         dpct::get_in_order_queue()
             .memcpy(data, (const char *)tensor->data + offset, size)
             .wait()));
@@ -11537,9 +11591,9 @@ static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer,
 
     ggml_cuda_set_device(ctx->device);
     SYCL_CHECK(
-        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
+        CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
-    SYCL_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
+    SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_in_order_queue()
                                     .memset(ctx->dev_ptr, value, buffer->size)
                                     .wait()));
 }
@@ -11572,7 +11626,7 @@ ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
 
     void * dev_ptr;
-    SYCL_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(
+    SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
                                     size, dpct::get_in_order_queue())));
 
     ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
@@ -11714,7 +11768,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+    SYCL_CHECK(CHECK_TRY_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
         (char *)tensor->data + offset, data, size)));
 }
 catch (sycl::exception const &exc) {
@@ -11732,7 +11786,7 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
     GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+    SYCL_CHECK(CHECK_TRY_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
         data, (const char *)tensor->data + offset, size)));
 }
 catch (sycl::exception const &exc) {
@@ -11744,7 +11798,7 @@ catch (sycl::exception const &exc) {
 static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try {
     ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
 
-    SYCL_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
+    SYCL_CHECK(CHECK_TRY_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
 
     UNUSED(backend);
 }
diff --git a/ggml-sycl.hpp b/ggml-sycl.hpp
index 40710da2e8bc8..252184a460975 100644
--- a/ggml-sycl.hpp
+++ b/ggml-sycl.hpp
@@ -1,4 +1,17 @@
 #include <sycl/sycl.hpp>
 #include <dpct/dpct.hpp>
-typedef half ggml_fp16_t;
+// typedef sycl::half ggml_fp16_t;
 
+#define CHECK_TRY_ERROR(expr)                                                  \
+  [&]() {                                                                      \
+    try {                                                                      \
+      expr;                                                                    \
+      return dpct::success;                                                    \
+    } catch (std::exception const &e) {                                        \
+      std::cerr << e.what()<< "\nException caught at file:" << __FILE__        \
+        << ", line:" << __LINE__ <<", func:"<<__func__<< std::endl;            \
+      return dpct::default_error;                                              \
+    }                                                                          \
+  }()
+
+#define DEBUG_CUDA_MALLOC
\ No newline at end of file
diff --git a/run.sh b/run.sh
index 39079dd20f9b7..de8744b5692d6 100755
--- a/run.sh
+++ b/run.sh
@@ -11,7 +11,7 @@ else
   export GGML_SYCL_DEVICE=0
 fi
 echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
-#export GGML_SYCL_DEBUG=1
+export GGML_SYCL_DEBUG=1
 #export GGML_SYCL_LIST_DEVICE=1
 #./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
diff --git a/setup.sh b/setup.sh
index b5d3809caebd1..0a071c984b270 100755
--- a/setup.sh
+++ b/setup.sh
@@ -6,4 +6,4 @@ source /opt/intel/oneapi/setvars.sh
 
 #cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-cmake --build . --config Release -v
+cmake --build . --config Release --target main

From 65f895d41b4f5201acca1cf2cabb11b2aae8d58e Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Thu, 4 Jan 2024 23:09:56 +0800
Subject: [PATCH 10/90] support main device is non-zero

---
 CMakeLists.txt |  1 +
 ggml-sycl.cpp  | 34 ++++++++++++++++++++++++++++++----
 ggml-sycl.hpp  |  4 +++-
 llama.cpp      | 12 ++++++++++++
 4 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8488689d91112..8054b2a4ff3d7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -474,6 +474,7 @@ if (LLAMA_SYCL)
 
     if (_sycl_support)
 	    add_compile_definitions(GGML_USE_CUBLAS)
+	    add_compile_definitions(GGML_USE_SYCL)
 	    #add_compile_definitions(GGML_SYCL_F16)
 	    #add_compile_options(-std=c++17 -O3 -fsycl)
 	add_compile_options(-I./)
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index dab9726a70b81..8353e544b3d95 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -387,8 +387,8 @@ catch (sycl::exception const &exc) {
 
 static int g_device_count = -1;
 static int g_all_sycl_device_count = -1;
-static int g_main_device = 0;
-static int g_main_device_index = 0;
+static int g_main_device = -1;
+static int g_main_device_index = -1;
 
 static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
 
@@ -413,6 +413,10 @@ static size_t g_scratch_offset = 0;
 
 static dpct::queue_ptr g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
 
+int get_main_device(){
+    return g_main_device;
+}
+
 [[noreturn]]
 static void bad_arch(const sycl::stream &stream_ct1) {
     stream_ct1 << "ERROR: ggml-cuda was compiled without support for the "
@@ -6388,7 +6392,10 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
 }
 
 int get_device_index_by_id(int id){
-    return g_sycl_device_id2index[id].index;
+    int res = g_sycl_device_id2index[id].index;
+    GGML_SYCL_DEBUG("zjy get_device_index_by_id id=%d device_index=%d\n", id, res);
+    GGML_ASSERT(res>=0);
+    return res;
 }
 
 int get_current_device_index(){
@@ -8057,6 +8064,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_index()));
+    GGML_SYCL_DEBUG("zjy ggml_cuda_pool_malloc_leg index %d\n", id);
 #ifdef DEBUG_CUDA_MALLOC
     int nnz = 0;
     size_t max_size = 0;
@@ -8080,6 +8088,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
                         *actual_size = b.size;
                         b.ptr = nullptr;
                         b.size = 0;
+                        GGML_SYCL_DEBUG("zjy ggml_cuda_pool_malloc_leg return 1 %p\n", ptr);
                         return ptr;
                     }
                 }
@@ -8092,6 +8101,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
         *actual_size = b.size;
         b.ptr = nullptr;
         b.size = 0;
+        GGML_SYCL_DEBUG("zjy ggml_cuda_pool_malloc_leg return 2 %p\n", ptr);
         return ptr;
     }
     void * ptr;
@@ -8327,10 +8337,19 @@ void ggml_init_cublas() try {
         fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
 #endif
 
+        for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
+            g_sycl_device_id2index[id].index = -1;
+            g_device_caps[id].vmm = 0;
+            g_device_caps[id].device_id = -1;
+            g_device_caps[id].cc = 0;
+            g_tensor_split[id] = 0;
+        }
+
 
         int device_inx = -1;
         for (int id = 0; id < g_all_sycl_device_count; ++id) {
             if(id!=user_device_number) continue;
+
             device_inx++;
             int device_vmm = 0;
 
@@ -8400,8 +8419,15 @@ void ggml_init_cublas() try {
 
         //zjy hardcode, force set to 1 device
         g_device_count = 1;
+        ggml_cuda_set_main_device(user_device_number);
         ggml_cuda_set_device(user_device_number);
         fprintf(stderr, "Using Device %d\n", user_device_number);
+
+        // for (int id = 0; id < g_all_sycl_device_count; ++id) {
+        //     GGML_SYCL_DEBUG("zjy id=%d  g_device_caps[%d].device_id=%d g_sycl_device_id2index[%d].index=%d ", id, id,
+        //     g_device_caps[id].device_id, id, g_sycl_device_id2index[id].index);
+        // }
+
         initialized = true;
         g_cublas_loaded = true;
     }
@@ -11220,7 +11246,7 @@ void ggml_cuda_set_main_device(const int main_device) try {
         return;
     }
 
-    if (g_main_device != main_device && g_device_count > 1) {
+    if (g_main_device != main_device && g_device_count >= 1) {
         g_main_device = main_device;
         g_main_device_index = get_device_index_by_id(g_main_device);
         dpct::device_info prop;
diff --git a/ggml-sycl.hpp b/ggml-sycl.hpp
index 252184a460975..2c56cf47dfe8a 100644
--- a/ggml-sycl.hpp
+++ b/ggml-sycl.hpp
@@ -14,4 +14,6 @@
     }                                                                          \
   }()
 
-#define DEBUG_CUDA_MALLOC
\ No newline at end of file
+#define DEBUG_CUDA_MALLOC
+
+int get_main_device();
\ No newline at end of file
diff --git a/llama.cpp b/llama.cpp
index f6f1ec0f403a8..276647be330c9 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -13,6 +13,10 @@
 #  include "ggml-opencl.h"
 #endif
 
+#ifdef GGML_USE_SYCL
+#  include "ggml-sycl.hpp"
+#endif
+
 #ifdef GGML_USE_METAL
 #  include "ggml-metal.h"
 #endif
@@ -9785,6 +9789,14 @@ struct llama_model * llama_load_model_from_file(
               struct llama_model_params   params) {
     ggml_time_init();
 
+#ifdef GGML_USE_SYCL
+    int main_device = get_main_device();
+    if(main_device>=0) params.main_gpu = main_device;
+    else {
+        LLAMA_LOG_ERROR("%s: missed to init GPU device\n", __func__);
+        std::exit(1);
+    }
+#endif
     llama_model * model = new llama_model;
 
     unsigned cur_percentage = 0;

From 3b1a743e82e299e5b7dbd84231e22000d4966a4b Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Sat, 6 Jan 2024 20:01:29 +0800
Subject: [PATCH 11/90] step7 add debug for code path, rm log

---
 ggml-sycl.cpp | 53 ++++++++++++++++++++++++++++++++++++---------------
 ggml-sycl.hpp |  2 +-
 run.sh        |  2 +-
 3 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 8353e544b3d95..bac7a87082beb 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -5782,6 +5782,7 @@ static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols,
                               const int nrows, const float eps,
                               dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
+    // printf("%s ncols=%d, nrows=%d, WARP_SIZE=%d\n", __func__, ncols, nrows, WARP_SIZE);
     if (ncols < 1024) {
         const sycl::range<3> block_dims(1, 1, WARP_SIZE);
         stream->submit([&](sycl::handler &cgh) {
@@ -6393,7 +6394,7 @@ static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
 
 int get_device_index_by_id(int id){
     int res = g_sycl_device_id2index[id].index;
-    GGML_SYCL_DEBUG("zjy get_device_index_by_id id=%d device_index=%d\n", id, res);
+    // GGML_SYCL_DEBUG("get_device_index_by_id id=%d device_index=%d\n", id, res);
     GGML_ASSERT(res>=0);
     return res;
 }
@@ -8064,7 +8065,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_index()));
-    GGML_SYCL_DEBUG("zjy ggml_cuda_pool_malloc_leg index %d\n", id);
+    GGML_SYCL_DEBUG("ggml_cuda_pool_malloc_leg index %d\n", id);
 #ifdef DEBUG_CUDA_MALLOC
     int nnz = 0;
     size_t max_size = 0;
@@ -8088,7 +8089,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
                         *actual_size = b.size;
                         b.ptr = nullptr;
                         b.size = 0;
-                        GGML_SYCL_DEBUG("zjy ggml_cuda_pool_malloc_leg return 1 %p\n", ptr);
+                        // GGML_SYCL_DEBUG("ggml_cuda_pool_malloc_leg return 1 %p\n", ptr);
                         return ptr;
                     }
                 }
@@ -8101,7 +8102,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
         *actual_size = b.size;
         b.ptr = nullptr;
         b.size = 0;
-        GGML_SYCL_DEBUG("zjy ggml_cuda_pool_malloc_leg return 2 %p\n", ptr);
+        // GGML_SYCL_DEBUG("ggml_cuda_pool_malloc_leg return 2 %p\n", ptr);
         return ptr;
     }
     void * ptr;
@@ -8117,7 +8118,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
     fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
             (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
 #endif
-    GGML_SYCL_DEBUG("zjy ggml_cuda_pool_malloc_leg return %p\n", ptr);
+    // GGML_SYCL_DEBUG("ggml_cuda_pool_malloc_leg return %p\n", ptr);
     return ptr;
 }
 catch (sycl::exception const &exc) {
@@ -8233,7 +8234,7 @@ struct cuda_pool_alloc {
     T * alloc(size_t size) {
         GGML_ASSERT(ptr == nullptr);
         ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->actual_size);
-        GGML_SYCL_DEBUG("zjy alloc %lu return %p actual size=%lu\n", size * sizeof(T), ptr, this->actual_size);
+        GGML_SYCL_DEBUG("alloc %lu return %p actual size=%lu\n", size * sizeof(T), ptr, this->actual_size);
         return ptr;
     }
 
@@ -8417,14 +8418,14 @@ void ggml_init_cublas() try {
         // configure logging to stdout
         // SYCL_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
 
-        //zjy hardcode, force set to 1 device
+        //hardcode, force set to 1 device
         g_device_count = 1;
         ggml_cuda_set_main_device(user_device_number);
         ggml_cuda_set_device(user_device_number);
         fprintf(stderr, "Using Device %d\n", user_device_number);
 
         // for (int id = 0; id < g_all_sycl_device_count; ++id) {
-        //     GGML_SYCL_DEBUG("zjy id=%d  g_device_caps[%d].device_id=%d g_sycl_device_id2index[%d].index=%d ", id, id,
+        //     GGML_SYCL_DEBUG("id=%d  g_device_caps[%d].device_id=%d g_sycl_device_id2index[%d].index=%d ", id, id,
         //     g_device_caps[id].device_id, id, g_sycl_device_id2index[id].index);
         // }
 
@@ -8524,7 +8525,7 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
     if (src->backend == GGML_BACKEND_CPU) {
         kind = dpct::host_to_device;
         src_ptr = (char *) src->data;
-        GGML_SYCL_DEBUG("zjy ggml_cuda_cpy_tensor_2d  GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
+        GGML_SYCL_DEBUG("ggml_cuda_cpy_tensor_2d  GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = dpct::device_to_device;
@@ -8532,10 +8533,10 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
         int id;
         SYCL_CHECK(CHECK_TRY_ERROR(
             id = get_current_device_index()));
-        GGML_SYCL_DEBUG("zjy current device index %d\n", id);
+        GGML_SYCL_DEBUG("current device index %d\n", id);
         src_ptr = (char *) extra->data_device[id];
     } else {
-        GGML_SYCL_DEBUG("zjy GGML_ASSERT(false)\n");
+        GGML_SYCL_DEBUG("GGML_ASSERT(false)\n");
         GGML_ASSERT(false);
     }
     char * dst_ptr = (char *) dst;
@@ -8552,7 +8553,7 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
 
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
-        GGML_SYCL_DEBUG("zjy stream->memcpy: dst_ptr=%p, x=%p, size=%lu\n", dst_ptr, x, i1_diff * nb1);
+        GGML_SYCL_DEBUG("stream->memcpy: dst_ptr=%p, x=%p, size=%lu\n", dst_ptr, x, i1_diff * nb1);
         // return CHECK_TRY_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1));
         return CHECK_TRY_ERROR(dpct::async_dpct_memcpy(dst_ptr, x, i1_diff * nb1,
                                     kind, *stream));
@@ -9596,15 +9597,15 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
 
     ggml_cuda_set_device(g_main_device);
     dpct::queue_ptr main_stream = g_cudaStreams[g_main_device_index][0];
-    GGML_SYCL_DEBUG("zjy g_main_device_index=%d, src0=%p\n", g_main_device_index, src0);
+    GGML_SYCL_DEBUG("g_main_device_index=%d, src0=%p\n", g_main_device_index, src0);
 
     if (src0_on_device) {
         src0_ddf = (float *) src0_extra->data_device[g_main_device_index];
     } else {
         src0_ddf = src0_f.alloc(ggml_nelements(src0));
-        GGML_SYCL_DEBUG("zjy g_main_device_index=%d, src0_ddf=%p\n", g_main_device_index, src0_ddf);
+        GGML_SYCL_DEBUG("g_main_device_index=%d, src0_ddf=%p\n", g_main_device_index, src0_ddf);
 
-        GGML_SYCL_DEBUG("zjy before ggml_cuda_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0);
+        GGML_SYCL_DEBUG("before ggml_cuda_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0);
         SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
@@ -9642,8 +9643,11 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
     }
 }
 catch (sycl::exception const &exc) {
+
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
             << ", line:" << __LINE__ << std::endl;
+  int *foo = (int*)-1;
+  printf("%d\n", *foo);
   std::exit(1);
 }
 
@@ -10041,78 +10045,97 @@ catch (sycl::exception const &exc) {
 }
 
 static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
 }
 
 static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
 }
 
 static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
 }
 
 static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
 }
 
 static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
 }
 
 static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
 }
 
 static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
 }
 
 static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
 }
 
 static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
 }
 
 static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
 }
 
 static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
 }
 
 static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
 }
 
 static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
 }
 
 static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
 }
 
 static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
 }
 
 static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
 }
 
 static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
 }
 
 static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
 }
 
 static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
 }
 
diff --git a/ggml-sycl.hpp b/ggml-sycl.hpp
index 2c56cf47dfe8a..235bbbd8a2a72 100644
--- a/ggml-sycl.hpp
+++ b/ggml-sycl.hpp
@@ -14,6 +14,6 @@
     }                                                                          \
   }()
 
-#define DEBUG_CUDA_MALLOC
+// #define DEBUG_CUDA_MALLOC
 
 int get_main_device();
\ No newline at end of file
diff --git a/run.sh b/run.sh
index de8744b5692d6..39079dd20f9b7 100755
--- a/run.sh
+++ b/run.sh
@@ -11,7 +11,7 @@ else
   export GGML_SYCL_DEVICE=0
 fi
 echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
-export GGML_SYCL_DEBUG=1
+#export GGML_SYCL_DEBUG=1
 #export GGML_SYCL_LIST_DEVICE=1
 #./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33

From c2ef7a9cb97740d9d33d6d2d3c06d81f590fb079 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Sun, 7 Jan 2024 16:55:55 +0800
Subject: [PATCH 12/90] step 8, rename all macro & func from cuda by sycl

---
 CMakeLists.txt             |   11 +-
 common/common.cpp          |   34 +-
 examples/server/server.cpp |    6 +-
 ggml-backend.c             |    5 +
 ggml-sycl.cpp              | 1952 ++++++++++++++++++------------------
 ggml-sycl.hpp              |   78 +-
 ggml.c                     |   24 +-
 llama.cpp                  |   17 +-
 llama.h                    |    5 +-
 9 files changed, 1107 insertions(+), 1025 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8054b2a4ff3d7..ce3a75d09b87a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,6 +104,7 @@ option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fas
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
+option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 
 option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
@@ -448,14 +449,13 @@ endif()
 
 if (LLAMA_SYCL)
     set(ENABLE_AOT ats)
-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Intel")
+    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM")
 	    message(WARNING "${CMAKE_C_COMPILER_ID} Need IntelLLVM for SYCL")
     endif()
-    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Intel")
+    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "IntelLLVM")
         message(WARNING "${CMAKE_CXX_COMPILER_ID} Need IntelLLVM for SYCL")
     endif()
 
-    #find_package(SYCL REQUIRED)
     find_package(IntelSYCL REQUIRED)
 
     # Check SYCL support by the compiler
@@ -473,10 +473,9 @@ if (LLAMA_SYCL)
     endif()
 
     if (_sycl_support)
-	    add_compile_definitions(GGML_USE_CUBLAS)
-	    add_compile_definitions(GGML_USE_SYCL)
+	    #add_compile_definitions(GGML_USE_CUBLAS)
+	add_compile_definitions(GGML_USE_SYCL)
 	    #add_compile_definitions(GGML_SYCL_F16)
-	    #add_compile_options(-std=c++17 -O3 -fsycl)
 	add_compile_options(-I./)
         add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include)
         add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include/sycl)
diff --git a/common/common.cpp b/common/common.cpp
index 0a7096171f2b5..b26daf2fdcef9 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -42,6 +42,10 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+#if (defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL))
+#define GGML_USE_CUBLAS_SYCL
+#endif
+
 int32_t get_num_physical_cores() {
 #ifdef __linux__
     // enumerate the set of thread siblings, num entries is num cores
@@ -601,9 +605,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CLBLAS_SYCL
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
+#endif // GGML_USE_CLBLAS_SYCL
         } else if (arg == "--split-mode" || arg == "-sm") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -620,14 +624,16 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CLBLAS_SYCL
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
+#endif // GGML_USE_CLBLAS_SYCL
+
         } else if (arg == "--tensor-split" || arg == "-ts") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
             std::string arg_next = argv[i];
 
             // split string by , and /
@@ -645,9 +651,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                     params.tensor_split[i] = 0.0f;
                 }
             }
-#ifndef GGML_USE_CUBLAS
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#ifndef GGML_USE_CLBLAS_SYCL
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
+#endif // GGML_USE_CLBLAS_SYCL
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--numa") {
@@ -1009,6 +1015,16 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
     printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
     printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
+#ifdef GGML_USE_CLBLAS
+    printf("  -nommq, --no-mul-mat-q\n");
+    printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
+    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
+#endif // GGML_USE_CLBLAS
+#ifdef GGML_USE_SYCL
+    printf("  -nommq, --no-mul-mat-q\n");
+    printf("                        use " GGML_SYCL_NAME " instead of custom mul_mat_q " GGML_SYCL_NAME " kernels.\n");
+    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
+#endif // GGML_USE_SYCL
 #endif
     printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 0462fbd24739b..bebc211d43ea8 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2319,7 +2319,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL) 
             std::string arg_next = argv[i];
 
             // split string by , and /
@@ -2345,7 +2345,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         else if (arg == "--no-mul-mat-q" || arg == "-nommq")
         {
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
             params.mul_mat_q = false;
 #else
             LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {});
@@ -2358,7 +2358,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 invalid_param = true;
                 break;
             }
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
             params.main_gpu = std::stoi(argv[i]);
 #else
             LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.", {});
diff --git a/ggml-backend.c b/ggml-backend.c
index 423512defc132..a20bdf4c97505 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -337,6 +337,11 @@ GGML_CALL static void ggml_backend_registry_init(void) {
     ggml_backend_cuda_reg_devices();
 #endif
 
+#ifdef GGML_USE_SYCL
+    extern void ggml_backend_sycl_reg_devices(void);
+    ggml_backend_sycl_reg_devices();
+#endif
+
 #ifdef GGML_USE_METAL
     extern GGML_CALL ggml_backend_t ggml_backend_reg_metal_init(const char * params, void * user_data);
     extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index bac7a87082beb..870676c9631d9 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -11,14 +11,12 @@
 #include <vector>
 #include <cmath>
 
-
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
 #include <dpct/dpct.hpp>
 #include <dpct/blas_utils.hpp>
 #include <dpct/lib_common_utils.hpp>
 #include "ggml-sycl.hpp"
-#include "ggml-cuda.h"
 #include "ggml.h"
 #include "ggml-backend-impl.h"
 
@@ -26,30 +24,37 @@ static int g_ggml_sycl_debug=0;
 
 #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
 
+#define CHECK_TRY_ERROR(expr)                                                  \
+  [&]() {                                                                      \
+    try {                                                                      \
+      expr;                                                                    \
+      return dpct::success;                                                    \
+    } catch (std::exception const &e) {                                        \
+      std::cerr << e.what()<< "\nException caught at file:" << __FILE__        \
+        << ", line:" << __LINE__ <<", func:"<<__func__<< std::endl;            \
+      return dpct::default_error;                                              \
+    }                                                                          \
+  }()
+
+// #define DEBUG_SYCL_MALLOC
+
+// typedef sycl::half ggml_fp16_t;
+
+#define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
 #define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
 #define CC_VOLTA      700
 #define CC_OFFSET_AMD 1000000
 #define CC_RDNA2      (CC_OFFSET_AMD + 1030)
 
-#define GGML_CUDA_MAX_NODES 8192
-
-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// -  7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-//       probably other such cases, and not sure what happens on AMD hardware
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
+#define GGML_SYCL_MAX_NODES 8192 //TODO: adapt to hardwares
+
+
+//define for XMX in Intel GPU
+//TODO: currently, it's not used for XMX really.
+#define SYCL_USE_XMX
 
 // max batch size to use MMQ kernels when tensor cores are available
-#define MMQ_MAX_BATCH_SIZE 32
+#define XMX_MAX_BATCH_SIZE 32
 
 
 #if defined(_MSC_VER)
@@ -65,17 +70,6 @@ static void ggml_sycl_error(const char * stmt, const char * func, const char * f
     GGML_ASSERT(!"SYCL error");
 }
 
-/*
-DPCT1001:50: The statement could not be removed.
-*/
-/*
-DPCT1000:51: Error handling if-stmt was detected but could not be rewritten.
-*/
-/*
-DPCT1009:52: SYCL uses exceptions to report errors and does not use the error
-codes. The original code was commented out and a warning string was inserted.
-You need to rewrite this code.
-*/
 #define SYCL_CHECK(err) do {                                                   \
     auto err_ = (err); if (err_ != 0) ggml_sycl_error(                         \
         #err, __func__, __FILE__, __LINE__,                                    \
@@ -83,10 +77,10 @@ You need to rewrite this code.
 } while (0)
 
 #if DPCT_COMPAT_RT_VERSION >= 11100
-#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
+#define GGML_SYCL_ASSUME(x) __builtin_assume(x)
 #else
-#define GGML_CUDA_ASSUME(x)
-#endif // CUDART_VERSION >= 11100
+#define GGML_SYCL_ASSUME(x)
+#endif // DPCT_COMPAT_RT_VERSION >= 11100
 
 #ifdef GGML_SYCL_F16
 typedef sycl::half dfloat; // dequantize float
@@ -129,22 +123,22 @@ static __dpct_inline__ int get_int_from_uint8_aligned(const uint8_t *x8,
 }
 
 template <typename T>
-using to_t_cuda_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
+using to_t_sycl_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
                              int k, dpct::queue_ptr stream);
-typedef to_t_cuda_t<float> to_fp32_cuda_t;
-typedef to_t_cuda_t<sycl::half> to_fp16_cuda_t;
+typedef to_t_sycl_t<float> to_fp32_sycl_t;
+typedef to_t_sycl_t<sycl::half> to_fp16_sycl_t;
 
 typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
 typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
-typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*ggml_cuda_op_mul_mat_t)(
+typedef void (*ggml_sycl_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
+typedef void (*ggml_sycl_op_mul_mat_t)(
     const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
     const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
     float *dst_dd_i, const int64_t row_low, const int64_t row_high,
     const int64_t src1_ncols, const int64_t src1_padded_row_size,
     const dpct::queue_ptr &stream);
-typedef void (*ggml_cuda_op_flatten_t)(const ggml_tensor *src0,
+typedef void (*ggml_sycl_op_flatten_t)(const ggml_tensor *src0,
                                        const ggml_tensor *src1,
                                        ggml_tensor *dst, const float *src0_dd,
                                        const float *src1_dd, float *dst_dd,
@@ -210,17 +204,17 @@ typedef struct dpct_type_143721 {
 } block_q8_1;
 static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
 
-typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
-typedef void (*allocate_tiles_cuda_t)(int **x_ql, sycl::half2 **x_dm,
+typedef float (*vec_dot_q_sycl_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
+typedef void (*allocate_tiles_sycl_t)(int **x_ql, sycl::half2 **x_dm,
                                       int **x_qh, int **x_sc);
-typedef void (*load_tiles_cuda_t)(const void *__restrict__ vx,
+typedef void (*load_tiles_sycl_t)(const void *__restrict__ vx,
                                   int *__restrict__ x_ql,
                                   sycl::half2 *__restrict__ x_dm,
                                   int *__restrict__ x_qh,
                                   int *__restrict__ x_sc, const int &i_offset,
                                   const int &i_max, const int &k,
                                   const int &blocks_per_row);
-typedef float (*vec_dot_q_mul_mat_cuda_t)(
+typedef float (*vec_dot_q_mul_mat_sycl_t)(
     const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
     const int *__restrict__ x_qh, const int *__restrict__ x_sc,
     const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
@@ -310,33 +304,33 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 #define WARP_SIZE 32
 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
 
-#define CUDA_GELU_BLOCK_SIZE 256
-#define CUDA_SILU_BLOCK_SIZE 256
-#define CUDA_TANH_BLOCK_SIZE 256
-#define CUDA_RELU_BLOCK_SIZE 256
-#define CUDA_SQR_BLOCK_SIZE 256
-#define CUDA_CPY_BLOCK_SIZE 32
-#define CUDA_SCALE_BLOCK_SIZE 256
-#define CUDA_CLAMP_BLOCK_SIZE 256
-#define CUDA_ROPE_BLOCK_SIZE 256
-#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
-#define CUDA_ALIBI_BLOCK_SIZE 32
-#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
-#define CUDA_QUANTIZE_BLOCK_SIZE 256
-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-#define CUDA_GET_ROWS_BLOCK_SIZE 256
-#define CUDA_UPSCALE_BLOCK_SIZE 256
-#define CUDA_CONCAT_BLOCK_SIZE 256
-#define CUDA_PAD_BLOCK_SIZE 256
-#define CUDA_ACC_BLOCK_SIZE 256
-#define CUDA_IM2COL_BLOCK_SIZE 256
+#define SYCL_GELU_BLOCK_SIZE 256
+#define SYCL_SILU_BLOCK_SIZE 256
+#define SYCL_TANH_BLOCK_SIZE 256
+#define SYCL_RELU_BLOCK_SIZE 256
+#define SYCL_SQR_BLOCK_SIZE 256
+#define SYCL_CPY_BLOCK_SIZE 32
+#define SYCL_SCALE_BLOCK_SIZE 256
+#define SYCL_CLAMP_BLOCK_SIZE 256
+#define SYCL_ROPE_BLOCK_SIZE 256
+#define SYCL_SOFT_MAX_BLOCK_SIZE 1024
+#define SYCL_ALIBI_BLOCK_SIZE 32
+#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
+#define SYCL_QUANTIZE_BLOCK_SIZE 256
+#define SYCL_DEQUANTIZE_BLOCK_SIZE 256
+#define SYCL_GET_ROWS_BLOCK_SIZE 256
+#define SYCL_UPSCALE_BLOCK_SIZE 256
+#define SYCL_CONCAT_BLOCK_SIZE 256
+#define SYCL_PAD_BLOCK_SIZE 256
+#define SYCL_ACC_BLOCK_SIZE 256
+#define SYCL_IM2COL_BLOCK_SIZE 256
 
 // dmmv = dequantize_mul_mat_vec
-#ifndef GGML_CUDA_DMMV_X
-#define GGML_CUDA_DMMV_X 32
+#ifndef GGML_SYCL_DMMV_X
+#define GGML_SYCL_DMMV_X 32
 #endif
-#ifndef GGML_CUDA_MMV_Y
-#define GGML_CUDA_MMV_Y 1
+#ifndef GGML_SYCL_MMV_Y
+#define GGML_SYCL_MMV_Y 1
 #endif
 
 #ifndef K_QUANTS_PER_ITERATION
@@ -345,26 +339,24 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_
 static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
 #endif
 
-#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
-#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
-#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
+#ifndef GGML_SYCL_PEER_MAX_BATCH_SIZE
+#define GGML_SYCL_PEER_MAX_BATCH_SIZE 128
+#endif // GGML_SYCL_PEER_MAX_BATCH_SIZE
 
 #define MUL_MAT_SRC1_COL_STRIDE 128
 
 #define MAX_STREAMS 8
-static dpct::queue_ptr g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = {
+static dpct::queue_ptr g_syclStreams[GGML_SYCL_MAX_DEVICES][MAX_STREAMS] = {
     {&dpct::get_in_order_queue()}};
 
 struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
+    void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split tensors
     dpct::event_ptr
-        events[GGML_CUDA_MAX_DEVICES]
+        events[GGML_SYCL_MAX_DEVICES]
               [MAX_STREAMS]; // events for synchronizing multiple GPUs
 };
 
-// this is faster on Windows
-// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
-inline dpct::err0 ggml_cuda_set_device(const int device) try {
+inline dpct::err0 ggml_sycl_set_device(const int device) try {
     int current_device;
     SYCL_CHECK(CHECK_TRY_ERROR(
         current_device = dpct::dev_mgr::instance().current_device_id()));
@@ -390,28 +382,28 @@ static int g_all_sycl_device_count = -1;
 static int g_main_device = -1;
 static int g_main_device_index = -1;
 
-static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
+static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0};
 
-struct cuda_device_capabilities {
+struct sycl_device_capabilities {
     int     cc;                 // compute capability
     bool    vmm;                // virtual memory support
     size_t  vmm_granularity;    // granularity of virtual memory
     int device_id;
 };
 
-static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0, -1} };
+static sycl_device_capabilities g_device_caps[GGML_SYCL_MAX_DEVICES] = { {0, false, 0, -1} };
 
 struct sycl_device_id2index {
     int index;
 };
 
-static sycl_device_id2index g_sycl_device_id2index[GGML_CUDA_MAX_DEVICES] = { {-1} };
+static sycl_device_id2index g_sycl_device_id2index[GGML_SYCL_MAX_DEVICES] = { {-1} };
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
 
-static dpct::queue_ptr g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
+static dpct::queue_ptr g_sycl_handles[GGML_SYCL_MAX_DEVICES] = {nullptr};
 
 int get_main_device(){
     return g_main_device;
@@ -419,7 +411,7 @@ int get_main_device(){
 
 [[noreturn]]
 static void bad_arch(const sycl::stream &stream_ct1) {
-    stream_ct1 << "ERROR: ggml-cuda was compiled without support for the "
+    stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
                   "current GPU architecture.\n";
     // __trap();
     std::exit(1);
@@ -2617,10 +2609,10 @@ load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql,
                 int *__restrict__ x_sc, const int &i_offset, const int &i_max,
                 const int &k, const int &blocks_per_row) {
     (void)x_qh; (void)x_sc;
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
 
     const int kbx  = k / QI4_0;
     const int kqsx = k % QI4_0;
@@ -2720,10 +2712,10 @@ load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql,
                 const int &k, const int &blocks_per_row) {
     (void)x_qh; (void)x_sc;
 
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
 
     const int kbx  = k / QI4_1;
     const int kqsx = k % QI4_1;
@@ -2821,10 +2813,10 @@ load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql,
                 const int &k, const int &blocks_per_row) {
     (void)x_qh; (void)x_sc;
 
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
 
     const int kbx  = k / QI5_0;
     const int kqsx = k % QI5_0;
@@ -2946,10 +2938,10 @@ load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql,
                 const int &k, const int &blocks_per_row) {
     (void)x_qh; (void)x_sc;
 
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset < nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset < nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
 
     const int kbx  = k / QI5_1;
     const int kqsx = k % QI5_1;
@@ -3062,10 +3054,10 @@ load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql,
                 const int &k, const int &blocks_per_row) {
     (void)x_qh; (void)x_sc;
 
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
 
     const int kbx  = k / QI8_0;
     const int kqsx = k % QI8_0;
@@ -3162,10 +3154,10 @@ load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql,
                 const int &k, const int &blocks_per_row) {
     (void)x_qh;
 
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
 
     const int kbx  = k / QI2_K;
     const int kqsx = k % QI2_K;
@@ -3289,10 +3281,10 @@ load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql,
                 int *__restrict__ x_sc, const int &i_offset, const int &i_max,
                 const int &k, const int &blocks_per_row) {
 
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
 
     const int kbx  = k / QI3_K;
     const int kqsx = k % QI3_K;
@@ -3450,7 +3442,7 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
 
 #else
 
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if __SYCL_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
     float sumf_d = 0.0f;
@@ -3490,7 +3482,7 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
 
 #else
     bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __SYCL_ARCH__ >= MIN_CC_DP4A
 
 #endif
 }
@@ -3515,10 +3507,10 @@ load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
                 const int &k, const int &blocks_per_row) {
     (void)x_qh;
 
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
 
     const int kbx  = k / QI4_K; // == 0 if QK_K == 256
     const int kqsx = k % QI4_K; // == k if QK_K == 256
@@ -3643,7 +3635,7 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
 
 #else
 
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if __SYCL_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 
     const int8_t * s = bq5_K->scales;
@@ -3679,7 +3671,7 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
 
 #else
     bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
+#endif // __SYCL_ARCH__ >= MIN_CC_DP4A
 
 #endif
 }
@@ -3704,10 +3696,10 @@ load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
                 const int &k, const int &blocks_per_row) {
     (void)x_qh;
 
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
 
     const int kbx  = k / QI5_K; // == 0 if QK_K == 256
     const int kqsx = k % QI5_K; // == k if QK_K == 256
@@ -3841,10 +3833,10 @@ load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
                 const int &k, const int &blocks_per_row) {
     (void)x_qh;
 
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
+    GGML_SYCL_ASSUME(i_offset >= 0);
+    GGML_SYCL_ASSUME(i_offset <  nwarps);
+    GGML_SYCL_ASSUME(k >= 0);
+    GGML_SYCL_ASSUME(k <  WARP_SIZE);
 
     const int kbx  = k / QI6_K; // == 0 if QK_K == 256
     const int kqsx = k % QI6_K; // == k if QK_K == 256
@@ -3930,8 +3922,8 @@ static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat(
 }
 
 template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
-          int mmq_y, int nwarps, load_tiles_cuda_t load_tiles, int vdr,
-          vec_dot_q_mul_mat_cuda_t vec_dot>
+          int mmq_y, int nwarps, load_tiles_sycl_t load_tiles, int vdr,
+          vec_dot_q_mul_mat_sycl_t vec_dot>
 /*
 DPCT1110:8: The total declared local variable size in device function mul_mat_q
 exceeds 128 bytes and may cause high register pressure. Consult with your
@@ -4078,7 +4070,7 @@ mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
 #define  MMQ_X_Q4_0_RDNA1  64
 #define  MMQ_Y_Q4_0_RDNA1  64
 #define NWARPS_Q4_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
+#if defined(SYCL_USE_XMX)
 #define  MMQ_X_Q4_0_AMPERE 4
 #define  MMQ_Y_Q4_0_AMPERE 32
 #define NWARPS_Q4_0_AMPERE 4
@@ -4122,7 +4114,7 @@ template <bool need_check> static void
 #define  MMQ_X_Q4_1_RDNA1  64
 #define  MMQ_Y_Q4_1_RDNA1  64
 #define NWARPS_Q4_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
+#if defined(SYCL_USE_XMX)
 #define  MMQ_X_Q4_1_AMPERE 4
 #define  MMQ_Y_Q4_1_AMPERE 32
 #define NWARPS_Q4_1_AMPERE 4
@@ -4165,7 +4157,7 @@ template <bool need_check> static void
 #define  MMQ_X_Q5_0_RDNA1  64
 #define  MMQ_Y_Q5_0_RDNA1  64
 #define NWARPS_Q5_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
+#if defined(SYCL_USE_XMX)
 #define  MMQ_X_Q5_0_AMPERE 4
 #define  MMQ_Y_Q5_0_AMPERE 32
 #define NWARPS_Q5_0_AMPERE 4
@@ -4208,7 +4200,7 @@ template <bool need_check> static void
 #define  MMQ_X_Q5_1_RDNA1  64
 #define  MMQ_Y_Q5_1_RDNA1  64
 #define NWARPS_Q5_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
+#if defined(SYCL_USE_XMX)
 #define  MMQ_X_Q5_1_AMPERE 4
 #define  MMQ_Y_Q5_1_AMPERE 32
 #define NWARPS_Q5_1_AMPERE 4
@@ -4251,7 +4243,7 @@ mul_mat_q5_1(
 #define  MMQ_X_Q8_0_RDNA1  64
 #define  MMQ_Y_Q8_0_RDNA1  64
 #define NWARPS_Q8_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
+#if defined(SYCL_USE_XMX)
 #define  MMQ_X_Q8_0_AMPERE 4
 #define  MMQ_Y_Q8_0_AMPERE 32
 #define NWARPS_Q8_0_AMPERE 4
@@ -4294,7 +4286,7 @@ template <bool need_check> static void
 #define  MMQ_X_Q2_K_RDNA1  128
 #define  MMQ_Y_Q2_K_RDNA1  32
 #define NWARPS_Q2_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
+#if defined(SYCL_USE_XMX)
 #define  MMQ_X_Q2_K_AMPERE 4
 #define  MMQ_Y_Q2_K_AMPERE 32
 #define NWARPS_Q2_K_AMPERE 4
@@ -4338,7 +4330,7 @@ mul_mat_q2_K(
 #define  MMQ_X_Q3_K_RDNA1  32
 #define  MMQ_Y_Q3_K_RDNA1  128
 #define NWARPS_Q3_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
+#if defined(SYCL_USE_XMX)
 #define  MMQ_X_Q3_K_AMPERE 4
 #define  MMQ_Y_Q3_K_AMPERE 32
 #define NWARPS_Q3_K_AMPERE 4
@@ -4383,7 +4375,7 @@ mul_mat_q3_K(
 #define  MMQ_X_Q4_K_RDNA1  32
 #define  MMQ_Y_Q4_K_RDNA1  64
 #define NWARPS_Q4_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
+#if defined(SYCL_USE_XMX)
 #define  MMQ_X_Q4_K_AMPERE 4
 #define  MMQ_Y_Q4_K_AMPERE 32
 #define NWARPS_Q4_K_AMPERE 4
@@ -4427,7 +4419,7 @@ template <bool need_check> static void
 #define  MMQ_X_Q5_K_RDNA1  32
 #define  MMQ_Y_Q5_K_RDNA1  64
 #define NWARPS_Q5_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
+#if defined(SYCL_USE_XMX)
 #define  MMQ_X_Q5_K_AMPERE 4
 #define  MMQ_Y_Q5_K_AMPERE 32
 #define NWARPS_Q5_K_AMPERE 4
@@ -4471,7 +4463,7 @@ mul_mat_q5_K(
 #define  MMQ_X_Q6_K_RDNA1  32
 #define  MMQ_Y_Q6_K_RDNA1  64
 #define NWARPS_Q6_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
+#if defined(SYCL_USE_XMX)
 #define  MMQ_X_Q6_K_AMPERE 4
 #define  MMQ_Y_Q6_K_AMPERE 32
 #define NWARPS_Q6_K_AMPERE 4
@@ -4508,7 +4500,7 @@ template <bool need_check> static void
         tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
 }
 
-template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
+template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_sycl_t vec_dot_q_sycl>
 static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
                           const sycl::nd_item<3> &item_ct1) {
     const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
@@ -4539,7 +4531,7 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
             (item_ct1.get_local_id(2) %
              (qi / vdr)); // x block quant index when casting the quants to int
 
-        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
+        tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
     }
 
     // sum up partial sums and write back result
@@ -4568,7 +4560,7 @@ static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat *
 
     const int tid = item_ct1.get_local_id(2);
 
-    const int iter_stride = 2*GGML_CUDA_DMMV_X;
+    const int iter_stride = 2*GGML_SYCL_DMMV_X;
     const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
     const int y_offset = qr == 1 ? 1 : qk/2;
 
@@ -5334,15 +5326,15 @@ static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta,
 }
 
 template <int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const ggml_tensor *src0, const ggml_tensor *src1,
+static void get_rows_sycl(const ggml_tensor *src0, const ggml_tensor *src1,
                           ggml_tensor *dst, const void *src0_dd,
                           const int32_t *src1_dd, float *dst_dd,
                           dpct::queue_ptr stream) {
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
-    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + 2*SYCL_GET_ROWS_BLOCK_SIZE - 1) / (2*SYCL_GET_ROWS_BLOCK_SIZE);
     const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
 
     // strides in elements
@@ -5369,15 +5361,15 @@ static void get_rows_cuda(const ggml_tensor *src0, const ggml_tensor *src1,
 }
 
 template <typename src0_t>
-static void get_rows_cuda_float(const ggml_tensor *src0,
+static void get_rows_sycl_float(const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const src0_t *src0_dd, const int32_t *src1_dd,
                                 float *dst_dd, dpct::queue_ptr stream) {
 
     GGML_TENSOR_BINARY_OP_LOCALS
 
-    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
+    const sycl::range<3> block_dims(1, 1, SYCL_GET_ROWS_BLOCK_SIZE);
+    const int block_num_x = (ne00 + SYCL_GET_ROWS_BLOCK_SIZE - 1) / SYCL_GET_ROWS_BLOCK_SIZE;
     const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
 
     // strides in elements
@@ -5407,7 +5399,7 @@ static void get_rows_cuda_float(const ggml_tensor *src0,
 }
 
 template<float (*bin_op)(const float, const float)>
-struct bin_bcast_cuda {
+struct bin_bcast_sycl {
     template <typename src0_t, typename src1_t, typename dst_t>
     void operator()(const struct ggml_tensor *src0,
                     const struct ggml_tensor *src1, struct ggml_tensor *dst,
@@ -5546,107 +5538,107 @@ struct bin_bcast_cuda {
     }
 };
 
-static void acc_f32_cuda(const float *x, const float *y, float *dst,
+static void acc_f32_sycl(const float *x, const float *y, float *dst,
                          const int n_elements, const int ne10, const int ne11,
                          const int ne12, const int nb1, const int nb2,
                          const int offset, dpct::queue_ptr stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
+    int num_blocks = (n_elements + SYCL_ACC_BLOCK_SIZE - 1) / SYCL_ACC_BLOCK_SIZE;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_ACC_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
                     item_ct1);
         });
 }
 
-static void gelu_f32_cuda(const float *x, float *dst, const int k,
+static void gelu_f32_sycl(const float *x, float *dst, const int k,
                           dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             gelu_f32(x, dst, k, item_ct1);
         });
 }
 
-static void silu_f32_cuda(const float *x, float *dst, const int k,
+static void silu_f32_sycl(const float *x, float *dst, const int k,
                           dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
+    const int num_blocks = (k + SYCL_SILU_BLOCK_SIZE - 1) / SYCL_SILU_BLOCK_SIZE;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SILU_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             silu_f32(x, dst, k, item_ct1);
         });
 }
 
-static void gelu_quick_f32_cuda(const float *x, float *dst, const int k,
+static void gelu_quick_f32_sycl(const float *x, float *dst, const int k,
                                 dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
+    const int num_blocks = (k + SYCL_GELU_BLOCK_SIZE - 1) / SYCL_GELU_BLOCK_SIZE;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_GELU_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             gelu_quick_f32(x, dst, k, item_ct1);
         });
 }
 
-static void tanh_f32_cuda(const float *x, float *dst, const int k,
+static void tanh_f32_sycl(const float *x, float *dst, const int k,
                           dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
+    const int num_blocks = (k + SYCL_TANH_BLOCK_SIZE - 1) / SYCL_TANH_BLOCK_SIZE;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_TANH_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             tanh_f32(x, dst, k, item_ct1);
         });
 }
 
-static void relu_f32_cuda(const float *x, float *dst, const int k,
+static void relu_f32_sycl(const float *x, float *dst, const int k,
                           dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             relu_f32(x, dst, k, item_ct1);
         });
 }
 
-static void leaky_relu_f32_cuda(const float *x, float *dst, const int k,
+static void leaky_relu_f32_sycl(const float *x, float *dst, const int k,
                                 const float negative_slope,
                                 dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
+    const int num_blocks = (k + SYCL_RELU_BLOCK_SIZE - 1) / SYCL_RELU_BLOCK_SIZE;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_RELU_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
         });
 }
 
-static void sqr_f32_cuda(const float *x, float *dst, const int k,
+static void sqr_f32_sycl(const float *x, float *dst, const int k,
                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
+    const int num_blocks = (k + SYCL_SQR_BLOCK_SIZE - 1) / SYCL_SQR_BLOCK_SIZE;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SQR_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             sqr_f32(x, dst, k, item_ct1);
         });
 }
 
-static void norm_f32_cuda(const float *x, float *dst, const int ncols,
+static void norm_f32_sycl(const float *x, float *dst, const int ncols,
                           const int nrows, const float eps,
                           dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
@@ -5688,7 +5680,7 @@ static void norm_f32_cuda(const float *x, float *dst, const int ncols,
     }
 }
 
-static void group_norm_f32_cuda(const float *x, float *dst,
+static void group_norm_f32_sycl(const float *x, float *dst,
                                 const int num_groups, const int group_size,
                                 const int ne_elements, dpct::queue_ptr stream) {
     static const float eps = 1e-6f;
@@ -5736,49 +5728,49 @@ static void group_norm_f32_cuda(const float *x, float *dst,
     }
 }
 
-static void concat_f32_cuda(const float *x, const float *y, float *dst,
+static void concat_f32_sycl(const float *x, const float *y, float *dst,
                             const int ne0, int ne1, int ne2, int ne02,
                             dpct::queue_ptr stream) {
-    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
+    int num_blocks = (ne0 + SYCL_CONCAT_BLOCK_SIZE - 1) / SYCL_CONCAT_BLOCK_SIZE;
     sycl::range<3> gridDim(ne2, ne1, num_blocks);
     stream->parallel_for(
         sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CONCAT_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             concat_f32(x, y, dst, ne0, ne02, item_ct1);
         });
 }
 
-static void upscale_f32_cuda(const float *x, float *dst, const int ne00,
+static void upscale_f32_sycl(const float *x, float *dst, const int ne00,
                              const int ne01, const int ne02,
                              const int scale_factor, dpct::queue_ptr stream) {
     int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
+    int num_blocks = (ne0 + SYCL_UPSCALE_BLOCK_SIZE - 1) / SYCL_UPSCALE_BLOCK_SIZE;
     sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
     stream->parallel_for(
         sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_UPSCALE_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
         });
 }
 
-static void pad_f32_cuda(const float *x, float *dst, const int ne00,
+static void pad_f32_sycl(const float *x, float *dst, const int ne00,
                          const int ne01, const int ne02, const int ne0,
                          const int ne1, const int ne2, dpct::queue_ptr stream) {
-    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
+    int num_blocks = (ne0 + SYCL_PAD_BLOCK_SIZE - 1) / SYCL_PAD_BLOCK_SIZE;
     sycl::range<3> gridDim(ne2, ne1, num_blocks);
     stream->parallel_for(
-        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE)),
+        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_PAD_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
         });
 }
 
-static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols,
+static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
                               const int nrows, const float eps,
                               dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % WARP_SIZE == 0);
@@ -5821,12 +5813,12 @@ static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols,
     }
 }
 
-static void quantize_row_q8_1_cuda(const float *x, void *vy, const int kx,
+static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,
                                    const int ky, const int kx_padded,
                                    dpct::queue_ptr stream) {
-    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
+    const int block_num_x = (kx_padded + SYCL_QUANTIZE_BLOCK_SIZE - 1) / SYCL_QUANTIZE_BLOCK_SIZE;
     const sycl::range<3> num_blocks(1, ky, block_num_x);
-    const sycl::range<3> block_size(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE);
+    const sycl::range<3> block_size(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE);
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -5840,10 +5832,10 @@ static void quantize_row_q8_1_cuda(const float *x, void *vy, const int kx,
 }
 
 template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void *__restrict__ vx,
+static void dequantize_block_sycl(const void *__restrict__ vx,
                                   dst_t *__restrict__ y, const int k,
                                   dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
+    const int num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE;
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -5851,8 +5843,8 @@ static void dequantize_block_cuda(const void *__restrict__ vx,
         stream->parallel_for(
             sycl::nd_range<3>(
                 sycl::range<3>(1, 1, num_blocks) *
-                    sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE),
-                sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE)),
+                    sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE),
+                sycl::range<3>(1, 1, SYCL_DEQUANTIZE_BLOCK_SIZE)),
             [=](sycl::nd_item<3> item_ct1) {
                 dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
             });
@@ -5860,7 +5852,7 @@ static void dequantize_block_cuda(const void *__restrict__ vx,
 }
 
 template <typename dst_t>
-static void dequantize_row_q2_K_cuda(const void *vx, dst_t *y, const int k,
+static void dequantize_row_q2_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
@@ -5881,7 +5873,7 @@ static void dequantize_row_q2_K_cuda(const void *vx, dst_t *y, const int k,
 }
 
 template <typename dst_t>
-static void dequantize_row_q3_K_cuda(const void *vx, dst_t *y, const int k,
+static void dequantize_row_q3_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
@@ -5902,7 +5894,7 @@ static void dequantize_row_q3_K_cuda(const void *vx, dst_t *y, const int k,
 }
 
 template <typename dst_t>
-static void dequantize_row_q4_K_cuda(const void *vx, dst_t *y, const int k,
+static void dequantize_row_q4_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
     {
@@ -5919,7 +5911,7 @@ static void dequantize_row_q4_K_cuda(const void *vx, dst_t *y, const int k,
 }
 
 template <typename dst_t>
-static void dequantize_row_q5_K_cuda(const void *vx, dst_t *y, const int k,
+static void dequantize_row_q5_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
@@ -5940,7 +5932,7 @@ static void dequantize_row_q5_K_cuda(const void *vx, dst_t *y, const int k,
 }
 
 template <typename dst_t>
-static void dequantize_row_q6_K_cuda(const void *vx, dst_t *y, const int k,
+static void dequantize_row_q6_K_sycl(const void *vx, dst_t *y, const int k,
                                      dpct::queue_ptr stream) {
     const int nb = k / QK_K;
 #if QK_K == 256
@@ -5960,73 +5952,73 @@ static void dequantize_row_q6_K_cuda(const void *vx, dst_t *y, const int k,
 #endif
 }
 
-static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
+static to_fp16_sycl_t ggml_get_to_fp16_sycl(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+            return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
         case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+            return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
         case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+            return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+            return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+            return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
+            return dequantize_row_q2_K_sycl;
         case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
+            return dequantize_row_q3_K_sycl;
         case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
+            return dequantize_row_q4_K_sycl;
         case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
+            return dequantize_row_q5_K_sycl;
         case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
+            return dequantize_row_q6_K_sycl;
         case GGML_TYPE_F32:
-            return dequantize_block_cuda<1, 1, convert_f32>;
+            return dequantize_block_sycl<1, 1, convert_f32>;
         default:
             return nullptr;
     }
 }
 
-static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+static to_fp32_sycl_t ggml_get_to_fp32_sycl(ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
+            return dequantize_block_sycl<QK4_0, QR4_0, dequantize_q4_0>;
         case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
+            return dequantize_block_sycl<QK4_1, QR4_1, dequantize_q4_1>;
         case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
+            return dequantize_block_sycl<QK5_0, QR5_0, dequantize_q5_0>;
         case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
+            return dequantize_block_sycl<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
+            return dequantize_block_sycl<QK8_0, QR8_0, dequantize_q8_0>;
         case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
+            return dequantize_row_q2_K_sycl;
         case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
+            return dequantize_row_q3_K_sycl;
         case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
+            return dequantize_row_q4_K_sycl;
         case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
+            return dequantize_row_q5_K_sycl;
         case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
+            return dequantize_row_q6_K_sycl;
         case GGML_TYPE_F16:
-            return dequantize_block_cuda<1, 1, convert_f16>;
+            return dequantize_block_sycl<1, 1, convert_f16>;
         default:
             return nullptr;
     }
 }
 
-static void dequantize_mul_mat_vec_q4_0_cuda(const void *vx, const dfloat *y,
+static void dequantize_mul_mat_vec_q4_0_sycl(const void *vx, const dfloat *y,
                                              float *dst, const int ncols,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -6040,14 +6032,14 @@ static void dequantize_mul_mat_vec_q4_0_cuda(const void *vx, const dfloat *y,
     }
 }
 
-static void dequantize_mul_mat_vec_q4_1_cuda(const void *vx, const dfloat *y,
+static void dequantize_mul_mat_vec_q4_1_sycl(const void *vx, const dfloat *y,
                                              float *dst, const int ncols,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -6061,14 +6053,14 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void *vx, const dfloat *y,
     }
 }
 
-static void dequantize_mul_mat_vec_q5_0_cuda(const void *vx, const dfloat *y,
+static void dequantize_mul_mat_vec_q5_0_sycl(const void *vx, const dfloat *y,
                                              float *dst, const int ncols,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -6082,14 +6074,14 @@ static void dequantize_mul_mat_vec_q5_0_cuda(const void *vx, const dfloat *y,
     }
 }
 
-static void dequantize_mul_mat_vec_q5_1_cuda(const void *vx, const dfloat *y,
+static void dequantize_mul_mat_vec_q5_1_sycl(const void *vx, const dfloat *y,
                                              float *dst, const int ncols,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -6103,14 +6095,14 @@ static void dequantize_mul_mat_vec_q5_1_cuda(const void *vx, const dfloat *y,
     }
 }
 
-static void dequantize_mul_mat_vec_q8_0_cuda(const void *vx, const dfloat *y,
+static void dequantize_mul_mat_vec_q8_0_sycl(const void *vx, const dfloat *y,
                                              float *dst, const int ncols,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -6124,7 +6116,7 @@ static void dequantize_mul_mat_vec_q8_0_cuda(const void *vx, const dfloat *y,
     }
 }
 
-static void dequantize_mul_mat_vec_q2_K_cuda(const void *vx, const float *y,
+static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
                                              float *dst, const int ncols,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
@@ -6140,7 +6132,7 @@ static void dequantize_mul_mat_vec_q2_K_cuda(const void *vx, const float *y,
         });
 }
 
-static void dequantize_mul_mat_vec_q3_K_cuda(const void *vx, const float *y,
+static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
                                              float *dst, const int ncols,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
@@ -6156,7 +6148,7 @@ static void dequantize_mul_mat_vec_q3_K_cuda(const void *vx, const float *y,
         });
 }
 
-static void dequantize_mul_mat_vec_q4_K_cuda(const void *vx, const float *y,
+static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
                                              float *dst, const int ncols,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
@@ -6172,7 +6164,7 @@ static void dequantize_mul_mat_vec_q4_K_cuda(const void *vx, const float *y,
         });
 }
 
-static void dequantize_mul_mat_vec_q5_K_cuda(const void *vx, const float *y,
+static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
                                              float *dst, const int ncols,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
@@ -6185,7 +6177,7 @@ static void dequantize_mul_mat_vec_q5_K_cuda(const void *vx, const float *y,
         });
 }
 
-static void dequantize_mul_mat_vec_q6_K_cuda(const void *vx, const float *y,
+static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
                                              float *dst, const int ncols,
                                              const int nrows,
                                              dpct::queue_ptr stream) {
@@ -6201,14 +6193,14 @@ static void dequantize_mul_mat_vec_q6_K_cuda(const void *vx, const float *y,
         });
 }
 
-static void convert_mul_mat_vec_f16_cuda(const void *vx, const dfloat *y,
+static void convert_mul_mat_vec_f16_sycl(const void *vx, const dfloat *y,
                                          float *dst, const int ncols,
                                          const int nrows,
                                          dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    GGML_ASSERT(ncols % GGML_SYCL_DMMV_X == 0);
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
@@ -6222,14 +6214,14 @@ static void convert_mul_mat_vec_f16_cuda(const void *vx, const dfloat *y,
     }
 }
 
-static void mul_mat_vec_q4_0_q8_1_cuda(const void *vx, const void *vy,
+static void mul_mat_vec_q4_0_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK4_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6239,14 +6231,14 @@ static void mul_mat_vec_q4_0_q8_1_cuda(const void *vx, const void *vy,
         });
 }
 
-static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy,
+static void mul_mat_vec_q4_1_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK4_1 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6256,14 +6248,14 @@ static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy,
         });
 }
 
-static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy,
+static void mul_mat_vec_q5_0_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK5_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6273,14 +6265,14 @@ static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy,
         });
 }
 
-static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy,
+static void mul_mat_vec_q5_1_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK5_1 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6290,14 +6282,14 @@ static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy,
         });
 }
 
-static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy,
+static void mul_mat_vec_q8_0_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK8_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6307,14 +6299,14 @@ static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy,
         });
 }
 
-static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy,
+static void mul_mat_vec_q2_K_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6324,14 +6316,14 @@ static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy,
         });
 }
 
-static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy,
+static void mul_mat_vec_q3_K_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6341,14 +6333,14 @@ static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy,
         });
 }
 
-static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy,
+static void mul_mat_vec_q4_K_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6358,14 +6350,14 @@ static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy,
         });
 }
 
-static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy,
+static void mul_mat_vec_q5_K_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6375,14 +6367,14 @@ static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy,
         });
 }
 
-static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
+static void mul_mat_vec_q6_K_q8_1_sycl(const void *vx, const void *vy,
                                        float *dst, const int ncols,
                                        const int nrows,
                                        dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
+    const int block_num_y = (nrows + GGML_SYCL_MMV_Y - 1) / GGML_SYCL_MMV_Y;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
+    const sycl::range<3> block_dims(1, GGML_SYCL_MMV_Y, WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
         [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
@@ -6403,7 +6395,7 @@ int get_current_device_index(){
     return get_device_index_by_id(dpct::dev_mgr::instance().current_device_id());
 }
 
-static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
+static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
                                         const int nrows_y, const int nrows_dst,
@@ -6518,7 +6510,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
+static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
                                         const int nrows_y, const int nrows_dst,
@@ -6633,7 +6625,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
+static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
                                         const int nrows_y, const int nrows_dst,
@@ -6748,7 +6740,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
+static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
                                         const int nrows_y, const int nrows_dst,
@@ -6863,7 +6855,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
+static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
                                         const int nrows_y, const int nrows_dst,
@@ -6978,7 +6970,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
+static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
                                         const int nrows_y, const int nrows_dst,
@@ -7099,7 +7091,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
+static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
                                         const int nrows_y, const int nrows_dst,
@@ -7229,7 +7221,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
+static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
                                         const int nrows_y, const int nrows_dst,
@@ -7350,7 +7342,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
+static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
                                         const int nrows_y, const int nrows_dst,
@@ -7471,7 +7463,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
+static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
                                         float *dst, const int ncols_x,
                                         const int nrows_x, const int ncols_y,
                                         const int nrows_y, const int nrows_dst,
@@ -7592,7 +7584,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_mul_mat_p021_f16_f32_cuda(const void *vx, const float *y,
+static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,
                                            float *dst, const int ncols_x,
                                            const int nrows_x,
                                            const int nchannels_x,
@@ -7614,7 +7606,7 @@ static void ggml_mul_mat_p021_f16_f32_cuda(const void *vx, const float *y,
     }
 }
 
-static void ggml_mul_mat_vec_nc_f16_f32_cuda(
+static void ggml_mul_mat_vec_nc_f16_f32_sycl(
     const void *vx, const float *y, float *dst, const int ncols_x,
     const int nrows_x, const int row_stride_x, const int nchannels_x,
     const int nchannels_y, const int channel_stride_x, dpct::queue_ptr stream) {
@@ -7635,7 +7627,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_cuda(
     }
 }
 
-static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne,
+static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne,
                                   const int ne00, const int ne01,
                                   const int nb00, const int nb01,
                                   const int nb02, const int ne10,
@@ -7643,15 +7635,15 @@ static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne,
                                   const int nb11, const int nb12,
                                   dpct::queue_ptr stream) {
 
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->parallel_for(
             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
             [=](sycl::nd_item<3> item_ct1) {
                 cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
                                            nb02, ne10, ne11, nb10, nb11, nb12,
@@ -7660,7 +7652,7 @@ static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne,
     }
 }
 
-static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne,
+static void ggml_cpy_f32_f16_sycl(const char *cx, char *cdst, const int ne,
                                   const int ne00, const int ne01,
                                   const int nb00, const int nb01,
                                   const int nb02, const int ne10,
@@ -7668,15 +7660,15 @@ static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne,
                                   const int nb11, const int nb12,
                                   dpct::queue_ptr stream) {
 
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->parallel_for(
             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
             [=](sycl::nd_item<3> item_ct1) {
                 cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
                                            nb02, ne10, ne11, nb10, nb11, nb12,
@@ -7685,7 +7677,7 @@ static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne,
     }
 }
 
-static void ggml_cpy_f32_q8_0_cuda(const char *cx, char *cdst, const int ne,
+static void ggml_cpy_f32_q8_0_sycl(const char *cx, char *cdst, const int ne,
                                    const int ne00, const int ne01,
                                    const int nb00, const int nb01,
                                    const int nb02, const int ne10,
@@ -7704,7 +7696,7 @@ static void ggml_cpy_f32_q8_0_cuda(const char *cx, char *cdst, const int ne,
                          });
 }
 
-static void ggml_cpy_f32_q4_0_cuda(const char *cx, char *cdst, const int ne,
+static void ggml_cpy_f32_q4_0_sycl(const char *cx, char *cdst, const int ne,
                                    const int ne00, const int ne01,
                                    const int nb00, const int nb01,
                                    const int nb02, const int ne10,
@@ -7723,7 +7715,7 @@ static void ggml_cpy_f32_q4_0_cuda(const char *cx, char *cdst, const int ne,
                          });
 }
 
-static void ggml_cpy_f32_q4_1_cuda(const char *cx, char *cdst, const int ne,
+static void ggml_cpy_f32_q4_1_sycl(const char *cx, char *cdst, const int ne,
                                    const int ne00, const int ne01,
                                    const int nb00, const int nb01,
                                    const int nb02, const int ne10,
@@ -7742,7 +7734,7 @@ static void ggml_cpy_f32_q4_1_cuda(const char *cx, char *cdst, const int ne,
                          });
 }
 
-static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne,
+static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
                                   const int ne00, const int ne01,
                                   const int nb00, const int nb01,
                                   const int nb02, const int ne10,
@@ -7750,15 +7742,15 @@ static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne,
                                   const int nb11, const int nb12,
                                   dpct::queue_ptr stream) {
 
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
     {
         dpct::has_capability_or_fail(stream->get_device(),
                                      {sycl::aspect::fp16});
 
         stream->parallel_for(
             sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
             [=](sycl::nd_item<3> item_ct1) {
                 cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
                                            nb02, ne10, ne11, nb10, nb11, nb12,
@@ -7767,39 +7759,39 @@ static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne,
     }
 }
 
-static void scale_f32_cuda(const float *x, float *dst, const float scale,
+static void scale_f32_sycl(const float *x, float *dst, const float scale,
                            const int k, dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
+    const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_SCALE_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             scale_f32(x, dst, scale, k, item_ct1);
         });
 }
 
-static void clamp_f32_cuda(const float *x, float *dst, const float min,
+static void clamp_f32_sycl(const float *x, float *dst, const float min,
                            const float max, const int k,
                            dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
+    const int num_blocks = (k + SYCL_CLAMP_BLOCK_SIZE - 1) / SYCL_CLAMP_BLOCK_SIZE;
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE)),
+                              sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE),
+                          sycl::range<3>(1, 1, SYCL_CLAMP_BLOCK_SIZE)),
         [=](sycl::nd_item<3> item_ct1) {
             clamp_f32(x, dst, min, max, k, item_ct1);
         });
 }
 
 template <typename T>
-static void rope_cuda(const T *x, T *dst, int ncols, int nrows,
+static void rope_sycl(const T *x, T *dst, int ncols, int nrows,
                       const int32_t *pos, float freq_scale, int p_delta_rows,
                       float freq_base, float ext_factor, float attn_factor,
                       rope_corr_dims corr_dims, dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % 2 == 0);
-    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
     const sycl::range<3> block_nums(1, num_blocks_x, nrows);
     if (pos == nullptr) {
         /*
@@ -7837,14 +7829,14 @@ static void rope_cuda(const T *x, T *dst, int ncols, int nrows,
 }
 
 template <typename T>
-static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows,
+static void rope_neox_sycl(const T *x, T *dst, int ncols, int n_dims, int nrows,
                            const int32_t *pos, float freq_scale,
                            int p_delta_rows, float freq_base, float ext_factor,
                            float attn_factor, rope_corr_dims corr_dims,
                            dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % 2 == 0);
-    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
+    const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1);
+    const int num_blocks_x = (ncols + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE);
     const sycl::range<3> block_nums(1, num_blocks_x, nrows);
 
     const float theta_scale = powf(freq_base, -2.0f/n_dims);
@@ -7886,13 +7878,13 @@ static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows,
     }
 }
 
-static void rope_glm_f32_cuda(const float *x, float *dst, int ncols, int nrows,
+static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
                               const int32_t *pos, float freq_scale,
                               int p_delta_rows, float freq_base, int n_ctx,
                               dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % 4 == 0);
-    const sycl::range<3> block_dims(1, 1, CUDA_ROPE_BLOCK_SIZE / 4);
-    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
+    const sycl::range<3> block_dims(1, 1, SYCL_ROPE_BLOCK_SIZE / 4);
+    const int num_blocks_x = (ncols + SYCL_ROPE_BLOCK_SIZE - 1) / SYCL_ROPE_BLOCK_SIZE;
     const sycl::range<3> block_nums(1, nrows, num_blocks_x);
     stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                          [=](sycl::nd_item<3> item_ct1) {
@@ -7902,12 +7894,12 @@ static void rope_glm_f32_cuda(const float *x, float *dst, int ncols, int nrows,
                          });
 }
 
-static void alibi_f32_cuda(const float *x, float *dst, const int ncols,
+static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
                            const int nrows, const int k_rows,
                            const int n_heads_log2_floor, const float m0,
                            const float m1, dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, 1, CUDA_ALIBI_BLOCK_SIZE);
-    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
+    const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
+    const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
     const sycl::range<3> block_nums(1, nrows, num_blocks_x);
     stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                          [=](sycl::nd_item<3> item_ct1) {
@@ -7916,7 +7908,7 @@ static void alibi_f32_cuda(const float *x, float *dst, const int ncols,
                          });
 }
 
-static void sum_rows_f32_cuda(const float *x, float *dst, const int ncols,
+static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
                               const int nrows, dpct::queue_ptr stream) {
     const sycl::range<3> block_dims(1, 1, WARP_SIZE);
     const sycl::range<3> block_nums(1, nrows, 1);
@@ -7927,7 +7919,7 @@ static void sum_rows_f32_cuda(const float *x, float *dst, const int ncols,
                              });
 }
 
-static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols,
+static void argsort_f32_i32_sycl(const float *x, int *dst, const int ncols,
                                  const int nrows, ggml_sort_order order,
                                  dpct::queue_ptr stream) {
     // bitonic sort requires ncols to be power of 2
@@ -7962,12 +7954,12 @@ static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols,
     }
 }
 
-static void diag_mask_inf_f32_cuda(const float *x, float *dst,
+static void diag_mask_inf_f32_sycl(const float *x, float *dst,
                                    const int ncols_x, const int nrows_x,
                                    const int rows_per_channel, const int n_past,
                                    dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
-    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
+    const sycl::range<3> block_dims(1, SYCL_DIAG_MASK_INF_BLOCK_SIZE, 1);
+    const int block_num_x = (ncols_x + SYCL_DIAG_MASK_INF_BLOCK_SIZE - 1) / SYCL_DIAG_MASK_INF_BLOCK_SIZE;
     const sycl::range<3> block_nums(1, block_num_x, nrows_x);
     stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                          [=](sycl::nd_item<3> item_ct1) {
@@ -7977,12 +7969,12 @@ static void diag_mask_inf_f32_cuda(const float *x, float *dst,
                          });
 }
 
-static void soft_max_f32_cuda(const float *x, const float *y, float *dst,
+static void soft_max_f32_sycl(const float *x, const float *y, float *dst,
                               const int ncols_x, const int nrows_x,
                               const int nrows_y, const float scale,
                               dpct::queue_ptr stream) {
     int nth = WARP_SIZE;
-    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
+    while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
     const sycl::range<3> block_dims(1, 1, nth);
     const sycl::range<3> block_nums(1, 1, nrows_x);
     /*
@@ -7992,12 +7984,12 @@ static void soft_max_f32_cuda(const float *x, const float *y, float *dst,
     */
     stream->submit([&](sycl::handler &cgh) {
         /*
-        DPCT1101:96: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
+        DPCT1101:96: 'SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
         replaced with a value. Modify the code to use the original expression,
         provided in comments, if it is correct.
         */
         sycl::local_accessor<float, 1> buf_acc_ct1(
-            sycl::range<1>(32 /*CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
+            sycl::range<1>(32 /*SYCL_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
 
         cgh.parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
@@ -8008,13 +8000,13 @@ static void soft_max_f32_cuda(const float *x, const float *y, float *dst,
     });
 }
 
-static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH,
+static void im2col_f32_f16_sycl(const float *x, sycl::half *dst, int IW, int IH,
                                 int OW, int OH, int KW, int KH, int IC,
                                 int offset_delta, int s0, int s1, int p0,
                                 int p1, int d0, int d1,
                                 dpct::queue_ptr stream) {
     const int parallel_elements = OW * KW * KH;
-    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
+    const int num_blocks = (parallel_elements + SYCL_IM2COL_BLOCK_SIZE - 1) / SYCL_IM2COL_BLOCK_SIZE;
     sycl::range<3> block_nums(IC, OH, num_blocks);
     {
         dpct::has_capability_or_fail(stream->get_device(),
@@ -8022,8 +8014,8 @@ static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH,
 
         stream->parallel_for(
             sycl::nd_range<3>(block_nums *
-                                  sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE)),
+                                  sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)),
             [=](sycl::nd_item<3> item_ct1) {
                 im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH,
                                parallel_elements, (IC * KH * KW), s0, s1, p0,
@@ -8032,8 +8024,8 @@ static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH,
     }
 }
 
-// buffer pool for cuda
-#define MAX_CUDA_BUFFERS 256
+// buffer pool for sycl
+#define MAX_SYCL_BUFFERS 256
 
 struct scoped_spin_lock {
     std::atomic_flag& lock;
@@ -8049,33 +8041,33 @@ struct scoped_spin_lock {
     scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
 };
 
-static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+static std::atomic_flag g_sycl_pool_lock = ATOMIC_FLAG_INIT;
 
-// #define DEBUG_CUDA_MALLOC
-struct cuda_buffer {
+// #define DEBUG_SYCL_MALLOC
+struct sycl_buffer {
     void * ptr = nullptr;
     size_t size = 0;
 };
 
-static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
+static sycl_buffer g_sycl_buffer_pool[GGML_SYCL_MAX_DEVICES][MAX_SYCL_BUFFERS];
+static size_t g_sycl_pool_size[GGML_SYCL_MAX_DEVICES] = {0};
 
-static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
-    scoped_spin_lock lock(g_cuda_pool_lock);
+static void *ggml_sycl_pool_malloc_leg(size_t size, size_t *actual_size) try {
+    scoped_spin_lock lock(g_sycl_pool_lock);
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_index()));
-    GGML_SYCL_DEBUG("ggml_cuda_pool_malloc_leg index %d\n", id);
-#ifdef DEBUG_CUDA_MALLOC
+    GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg index %d\n", id);
+#ifdef DEBUG_SYCL_MALLOC
     int nnz = 0;
     size_t max_size = 0;
 #endif
     size_t best_diff = 1ull << 36;
     int ibest = -1;
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][i];
+    for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+        sycl_buffer& b = g_sycl_buffer_pool[id][i];
         if (b.ptr != nullptr) {
-#ifdef DEBUG_CUDA_MALLOC
+#ifdef DEBUG_SYCL_MALLOC
             ++nnz;
             if (b.size > max_size) max_size = b.size;
 #endif
@@ -8089,7 +8081,7 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
                         *actual_size = b.size;
                         b.ptr = nullptr;
                         b.size = 0;
-                        // GGML_SYCL_DEBUG("ggml_cuda_pool_malloc_leg return 1 %p\n", ptr);
+                        // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg return 1 %p\n", ptr);
                         return ptr;
                     }
                 }
@@ -8097,12 +8089,12 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
         }
     }
     if (ibest >= 0) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
+        sycl_buffer& b = g_sycl_buffer_pool[id][ibest];
         void * ptr = b.ptr;
         *actual_size = b.size;
         b.ptr = nullptr;
         b.size = 0;
-        // GGML_SYCL_DEBUG("ggml_cuda_pool_malloc_leg return 2 %p\n", ptr);
+        // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg return 2 %p\n", ptr);
         return ptr;
     }
     void * ptr;
@@ -8112,13 +8104,13 @@ static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
         CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
                              look_ahead_size, dpct::get_in_order_queue())));
     *actual_size = look_ahead_size;
-    g_cuda_pool_size[id] += look_ahead_size;
+    g_sycl_pool_size[id] += look_ahead_size;
 
-#ifdef DEBUG_CUDA_MALLOC
+#ifdef DEBUG_SYCL_MALLOC
     fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
+            (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
 #endif
-    // GGML_SYCL_DEBUG("ggml_cuda_pool_malloc_leg return %p\n", ptr);
+    // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg return %p\n", ptr);
     return ptr;
 }
 catch (sycl::exception const &exc) {
@@ -8127,23 +8119,23 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
-    scoped_spin_lock lock(g_cuda_pool_lock);
+static void ggml_sycl_pool_free_leg(void *ptr, size_t size) try {
+    scoped_spin_lock lock(g_sycl_pool_lock);
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_index()));
 
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][i];
+    for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
+        sycl_buffer& b = g_sycl_buffer_pool[id][i];
         if (b.ptr == nullptr) {
             b.ptr = ptr;
             b.size = size;
             return;
         }
     }
-    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_SYCL_BUFFERS\n");
     SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
-    g_cuda_pool_size[id] -= size;
+    g_sycl_pool_size[id] -= size;
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -8156,12 +8148,12 @@ catch (sycl::exception const &exc) {
 DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.
 */
 // static std::vector<CUmemGenericAllocationHandle>
-//     g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
-static dpct::device_ptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
-static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
-static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
+//     g_sycl_pool_handles[GGML_SYCL_MAX_DEVICES];
+static dpct::device_ptr g_sycl_pool_addr[GGML_SYCL_MAX_DEVICES] = {0};
+static size_t g_sycl_pool_used[GGML_SYCL_MAX_DEVICES] = {0};
+static const size_t SYCL_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
 
-static void *ggml_cuda_pool_malloc_vmm(size_t size, size_t *actual_size) try {
+static void *ggml_sycl_pool_malloc_vmm(size_t size, size_t *actual_size) try {
 
     return NULL;
 }
@@ -8171,20 +8163,20 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try {
-    scoped_spin_lock lock(g_cuda_pool_lock);
+static void ggml_sycl_pool_free_vmm(void *ptr, size_t size) try {
+    scoped_spin_lock lock(g_sycl_pool_lock);
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
 
-#ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
+#ifdef DEBUG_SYCL_MALLOC
+    printf("sycl pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
 #endif
 
-    g_cuda_pool_used[id] -= size;
+    g_sycl_pool_used[id] -= size;
 
     // all deallocations must be in reverse order of the allocations
-    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
+    GGML_ASSERT(ptr == (void *) (g_sycl_pool_addr[id] + g_sycl_pool_used[id]));
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -8192,14 +8184,14 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try {
+static void *ggml_sycl_pool_malloc(size_t size, size_t *actual_size) try {
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_index()));
     if (g_device_caps[id].vmm) {
-        return ggml_cuda_pool_malloc_vmm(size, actual_size);
+        return ggml_sycl_pool_malloc_vmm(size, actual_size);
     } else {
-        return ggml_cuda_pool_malloc_leg(size, actual_size);
+        return ggml_sycl_pool_malloc_leg(size, actual_size);
     }
 }
 catch (sycl::exception const &exc) {
@@ -8208,14 +8200,14 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_cuda_pool_free(void *ptr, size_t size) try {
+static void ggml_sycl_pool_free(void *ptr, size_t size) try {
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_index()));
     if (g_device_caps[id].vmm) {
-        ggml_cuda_pool_free_vmm(ptr, size);
+        ggml_sycl_pool_free_vmm(ptr, size);
     } else {
-        ggml_cuda_pool_free_leg(ptr, size);
+        ggml_sycl_pool_free_leg(ptr, size);
     }
 }
 catch (sycl::exception const &exc) {
@@ -8226,25 +8218,25 @@ catch (sycl::exception const &exc) {
 
 
 template<typename T>
-struct cuda_pool_alloc {
+struct sycl_pool_alloc {
     T * ptr = nullptr;
     size_t actual_size = 0;
 
     // size is in number of elements
     T * alloc(size_t size) {
         GGML_ASSERT(ptr == nullptr);
-        ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->actual_size);
+        ptr = (T *) ggml_sycl_pool_malloc(size * sizeof(T), &this->actual_size);
         GGML_SYCL_DEBUG("alloc %lu return %p actual size=%lu\n", size * sizeof(T), ptr, this->actual_size);
         return ptr;
     }
 
-    cuda_pool_alloc(size_t size) {
+    sycl_pool_alloc(size_t size) {
         alloc(size);
     }
 
-    ~cuda_pool_alloc() {
+    ~sycl_pool_alloc() {
         if (ptr != nullptr) {
-            ggml_cuda_pool_free(ptr, actual_size);
+            ggml_sycl_pool_free(ptr, actual_size);
         }
     }
 
@@ -8252,17 +8244,17 @@ struct cuda_pool_alloc {
         return ptr;
     }
 
-    cuda_pool_alloc() = default;
-    cuda_pool_alloc(const cuda_pool_alloc &) = delete;
-    cuda_pool_alloc(cuda_pool_alloc &&) = delete;
-    cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
-    cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
+    sycl_pool_alloc() = default;
+    sycl_pool_alloc(const sycl_pool_alloc &) = delete;
+    sycl_pool_alloc(sycl_pool_alloc &&) = delete;
+    sycl_pool_alloc& operator=(const sycl_pool_alloc &) = delete;
+    sycl_pool_alloc& operator=(sycl_pool_alloc &&) = delete;
 };
 
-static bool g_cublas_loaded = false;
+static bool g_sycl_loaded = false;
 
-bool ggml_cublas_loaded(void) {
-    return g_cublas_loaded;
+bool ggml_sycl_loaded(void) {
+    return g_sycl_loaded;
 }
 void print_devices(){
     int device_count = dpct::dev_mgr::instance().device_count();
@@ -8295,7 +8287,7 @@ int get_sycl_env(const char* env_name, int default_val){
     return user_number;
 }
 
-void ggml_init_cublas() try {
+void ggml_init_sycl() try {
     static bool initialized = false;
 
     if (!initialized) {
@@ -8318,11 +8310,11 @@ void ggml_init_cublas() try {
                                  dpct::dev_mgr::instance().device_count()) !=
             0) {
             initialized = true;
-            g_cublas_loaded = false;
+            g_sycl_loaded = false;
             return;
         }
 
-        GGML_ASSERT(g_all_sycl_device_count <= GGML_CUDA_MAX_DEVICES);
+        GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
         int64_t total_vram = 0;
 
 #if defined(GGML_SYCL_FP16)
@@ -8332,13 +8324,13 @@ void ggml_init_cublas() try {
 #endif
 
 
-#if defined(CUDA_USE_TENSOR_CORES)
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
+#if defined(SYCL_USE_XMX)
+        fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__);
 #else
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
+        fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
 
-        for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) {
+        for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
             g_sycl_device_id2index[id].index = -1;
             g_device_caps[id].vmm = 0;
             g_device_caps[id].device_id = -1;
@@ -8361,10 +8353,7 @@ void ggml_init_cublas() try {
             dpct::device_info prop;
             SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
                 prop, dpct::dev_mgr::instance().get_device(id))));
-            /*
-            DPCT1005:86: The SYCL device version is different from CUDA Compute
-            Compatibility. You may need to rewrite this code.
-            */
+
             fprintf(stderr,
                     "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id,
                     prop.get_name(), prop.get_major_version(),
@@ -8372,13 +8361,10 @@ void ggml_init_cublas() try {
 
             g_tensor_split[device_inx] = total_vram;
             total_vram += prop.get_global_mem_size();
-            /*
-            DPCT1005:87: The SYCL device version is different from CUDA Compute
-            Compatibility. You may need to rewrite this code.
-            */
+
             g_device_caps[device_inx].cc =
                 100 * prop.get_major_version() + 10 * prop.get_minor_version();
-              // g_device_caps[id].cc = 9000;
+
             printf("g_device_caps[%d].cc=%d\n", device_inx, g_device_caps[device_inx].cc);
         }
         device_inx = -1;
@@ -8392,36 +8378,36 @@ void ggml_init_cublas() try {
         for (int id = 0; id < g_all_sycl_device_count; ++id) {
             if(id!=user_device_number) continue;
             device_inx++;
-            SYCL_CHECK(ggml_cuda_set_device(id));
+            SYCL_CHECK(ggml_sycl_set_device(id));
 
-            // create cuda streams
+            // create sycl streams
             for (int is = 0; is < MAX_STREAMS; ++is) {
                 /*
                 DPCT1025:88: The SYCL queue is created ignoring the flag and
                 priority options.
                 */
                 SYCL_CHECK(CHECK_TRY_ERROR(
-                    g_cudaStreams[device_inx][is] =
+                    g_syclStreams[device_inx][is] =
                         dpct::get_current_device().create_queue()));
             }
 
-            // create cublas handle
-            SYCL_CHECK(CHECK_TRY_ERROR(g_cublas_handles[device_inx] =
+            // create sycl handle
+            SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[device_inx] =
                                               &dpct::get_in_order_queue()));
             /*
-            DPCT1027:89: The call to cublasSetMathMode was replaced with 0
+            DPCT1027:89: The call to syclSetMathMode was replaced with 0
             because this functionality is redundant in SYCL.
             */
             SYCL_CHECK(0);
         }
 
         // configure logging to stdout
-        // SYCL_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+        // SYCL_CHECK(syclLoggerConfigure(1, 1, 0, nullptr));
 
         //hardcode, force set to 1 device
         g_device_count = 1;
-        ggml_cuda_set_main_device(user_device_number);
-        ggml_cuda_set_device(user_device_number);
+        ggml_sycl_set_main_device(user_device_number);
+        ggml_sycl_set_device(user_device_number);
         fprintf(stderr, "Using Device %d\n", user_device_number);
 
         // for (int id = 0; id < g_all_sycl_device_count; ++id) {
@@ -8430,7 +8416,7 @@ void ggml_init_cublas() try {
         // }
 
         initialized = true;
-        g_cublas_loaded = true;
+        g_sycl_loaded = true;
     }
 }
 catch (sycl::exception const &exc) {
@@ -8440,7 +8426,7 @@ catch (sycl::exception const &exc) {
 }
 
 
-void ggml_cuda_set_tensor_split(const float * tensor_split) {
+void ggml_sycl_set_tensor_split(const float * tensor_split) {
     if (tensor_split == nullptr) {
         return;
     }
@@ -8464,8 +8450,8 @@ void ggml_cuda_set_tensor_split(const float * tensor_split) {
     }
 }
 
-void *ggml_cuda_host_malloc(size_t size) try {
-    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+void *ggml_sycl_host_malloc(size_t size) try {
+    if (getenv("GGML_SYCL_NO_PINNED") != nullptr) {
         return nullptr;
     }
 
@@ -8478,7 +8464,7 @@ void *ggml_cuda_host_malloc(size_t size) try {
     if (err != 0) {
         // clear the error
         /*
-        DPCT1026:83: The call to cudaGetLastError was removed because this
+        DPCT1026:83: The call to syclGetLastError was removed because this
         functionality is redundant in SYCL.
         */
         /*
@@ -8493,7 +8479,7 @@ void *ggml_cuda_host_malloc(size_t size) try {
             string was inserted. You need to rewrite this code.
             */
             size / 1024.0 / 1024.0,
-            "cudaGetErrorString is not supported" /*cudaGetErrorString(err)*/);
+            "syclGetErrorString is not supported" /*syclGetErrorString(err)*/);
         return nullptr;
     }
 
@@ -8505,7 +8491,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-void ggml_cuda_host_free(void *ptr) try {
+void ggml_sycl_host_free(void *ptr) try {
     SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
 }
 catch (sycl::exception const &exc) {
@@ -8514,7 +8500,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
+static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
                                           const struct ggml_tensor *src,
                                           int64_t i3, int64_t i2,
                                           int64_t i1_low, int64_t i1_high,
@@ -8525,7 +8511,7 @@ static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
     if (src->backend == GGML_BACKEND_CPU) {
         kind = dpct::host_to_device;
         src_ptr = (char *) src->data;
-        GGML_SYCL_DEBUG("ggml_cuda_cpy_tensor_2d  GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
+        GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d  GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = dpct::device_to_device;
@@ -8587,7 +8573,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_cuda_op_get_rows(const ggml_tensor *src0,
+static void ggml_sycl_op_get_rows(const ggml_tensor *src0,
                                   const ggml_tensor *src1, ggml_tensor *dst,
                                   const float *src0_d, const float *src1_d,
                                   float *dst_d, const dpct::queue_ptr &stream) {
@@ -8603,26 +8589,26 @@ static void ggml_cuda_op_get_rows(const ggml_tensor *src0,
 
     switch (src0->type) {
         case GGML_TYPE_F16:
-            get_rows_cuda_float(src0, src1, dst, (const sycl::half *)src0_d,
+            get_rows_sycl_float(src0, src1, dst, (const sycl::half *)src0_d,
                                 src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_F32:
-            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            get_rows_sycl_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            get_rows_sycl<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            get_rows_sycl<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            get_rows_sycl<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            get_rows_sycl<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
+            get_rows_sycl<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
             break;
         default:
             // TODO: k-quants
@@ -8633,7 +8619,7 @@ static void ggml_cuda_op_get_rows(const ggml_tensor *src0,
 }
 
 template <class op>
-inline void ggml_cuda_op_bin_bcast(const ggml_tensor *src0,
+inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0,
                                    const ggml_tensor *src1, ggml_tensor *dst,
                                    const float *src0_dd, const float *src1_dd,
                                    float *dst_dd,
@@ -8656,27 +8642,27 @@ inline void ggml_cuda_op_bin_bcast(const ggml_tensor *src0,
     }
 }
 
-static void ggml_cuda_op_repeat(const ggml_tensor *src0,
+static void ggml_sycl_op_repeat(const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_d, const float *src1_d,
                                 float *dst_d,
                                 const dpct::queue_ptr &main_stream) {
 
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
 
     (void) src1;
     (void) src1_d;
 }
 
-inline void ggml_cuda_op_add(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_add(const ggml_tensor *src0, const ggml_tensor *src1,
                              ggml_tensor *dst, const float *src0_dd,
                              const float *src1_dd, float *dst_dd,
                              const dpct::queue_ptr &main_stream) {
 
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
-inline void ggml_cuda_op_acc(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_acc(const ggml_tensor *src0, const ggml_tensor *src1,
                              ggml_tensor *dst, const float *src0_dd,
                              const float *src1_dd, float *dst_dd,
                              const dpct::queue_ptr &main_stream) {
@@ -8691,28 +8677,28 @@ inline void ggml_cuda_op_acc(const ggml_tensor *src0, const ggml_tensor *src1,
     // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
     int offset = dst->op_params[3] / 4; // offset in bytes
 
-    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
+    acc_f32_sycl(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
 
     (void) dst;
 }
 
-inline void ggml_cuda_op_mul(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_mul(const ggml_tensor *src0, const ggml_tensor *src1,
                              ggml_tensor *dst, const float *src0_dd,
                              const float *src1_dd, float *dst_dd,
                              const dpct::queue_ptr &main_stream) {
 
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
-inline void ggml_cuda_op_div(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_div(const ggml_tensor *src0, const ggml_tensor *src1,
                              ggml_tensor *dst, const float *src0_dd,
                              const float *src1_dd, float *dst_dd,
                              const dpct::queue_ptr &main_stream) {
 
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
+    ggml_sycl_op_bin_bcast<bin_bcast_sycl<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
 }
 
-inline void ggml_cuda_op_gelu(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_gelu(const ggml_tensor *src0, const ggml_tensor *src1,
                               ggml_tensor *dst, const float *src0_dd,
                               const float *src1_dd, float *dst_dd,
                               const dpct::queue_ptr &main_stream) {
@@ -8720,14 +8706,14 @@ inline void ggml_cuda_op_gelu(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+    gelu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_silu(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_silu(const ggml_tensor *src0, const ggml_tensor *src1,
                               ggml_tensor *dst, const float *src0_dd,
                               const float *src1_dd, float *dst_dd,
                               const dpct::queue_ptr &main_stream) {
@@ -8735,14 +8721,14 @@ inline void ggml_cuda_op_silu(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+    silu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_gelu_quick(const ggml_tensor *src0,
+inline void ggml_sycl_op_gelu_quick(const ggml_tensor *src0,
                                     const ggml_tensor *src1, ggml_tensor *dst,
                                     const float *src0_dd, const float *src1_dd,
                                     float *dst_dd,
@@ -8751,14 +8737,14 @@ inline void ggml_cuda_op_gelu_quick(const ggml_tensor *src0,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+    gelu_quick_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1,
                               ggml_tensor *dst, const float *src0_dd,
                               const float *src1_dd, float *dst_dd,
                               const dpct::queue_ptr &main_stream) {
@@ -8766,14 +8752,14 @@ inline void ggml_cuda_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+    tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_relu(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_relu(const ggml_tensor *src0, const ggml_tensor *src1,
                               ggml_tensor *dst, const float *src0_dd,
                               const float *src1_dd, float *dst_dd,
                               const dpct::queue_ptr &main_stream) {
@@ -8781,14 +8767,14 @@ inline void ggml_cuda_op_relu(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+    relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_leaky_relu(const ggml_tensor *src0,
+inline void ggml_sycl_op_leaky_relu(const ggml_tensor *src0,
                                     const ggml_tensor *src1, ggml_tensor *dst,
                                     const float *src0_dd, const float *src1_dd,
                                     float *dst_dd,
@@ -8800,14 +8786,14 @@ inline void ggml_cuda_op_leaky_relu(const ggml_tensor *src0,
     float negative_slope;
     memcpy(&negative_slope, dst->op_params, sizeof(float));
 
-    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
+    leaky_relu_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_sqr(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_sqr(const ggml_tensor *src0, const ggml_tensor *src1,
                              ggml_tensor *dst, const float *src0_dd,
                              const float *src1_dd, float *dst_dd,
                              const dpct::queue_ptr &main_stream) {
@@ -8815,14 +8801,14 @@ inline void ggml_cuda_op_sqr(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
+    sqr_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_norm(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_norm(const ggml_tensor *src0, const ggml_tensor *src1,
                               ggml_tensor *dst, const float *src0_dd,
                               const float *src1_dd, float *dst_dd,
                               const dpct::queue_ptr &main_stream) {
@@ -8836,14 +8822,14 @@ inline void ggml_cuda_op_norm(const ggml_tensor *src0, const ggml_tensor *src1,
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+    norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_group_norm(const ggml_tensor *src0,
+inline void ggml_sycl_op_group_norm(const ggml_tensor *src0,
                                     const ggml_tensor *src1, ggml_tensor *dst,
                                     const float *src0_dd, const float *src1_dd,
                                     float *dst_dd,
@@ -8854,14 +8840,14 @@ inline void ggml_cuda_op_group_norm(const ggml_tensor *src0,
 
     int num_groups = dst->op_params[0];
     int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
+    group_norm_f32_sycl(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_concat(const ggml_tensor *src0,
+inline void ggml_sycl_op_concat(const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_dd, const float *src1_dd,
                                 float *dst_dd,
@@ -8872,14 +8858,14 @@ inline void ggml_cuda_op_concat(const ggml_tensor *src0,
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
 
     for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
+        concat_f32_sycl(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
     }
 
     (void) src1;
     (void) dst;
 }
 
-inline void ggml_cuda_op_upscale(const ggml_tensor *src0,
+inline void ggml_sycl_op_upscale(const ggml_tensor *src0,
                                  const ggml_tensor *src1, ggml_tensor *dst,
                                  const float *src0_dd, const float *src1_dd,
                                  float *dst_dd,
@@ -8891,14 +8877,14 @@ inline void ggml_cuda_op_upscale(const ggml_tensor *src0,
 
     const int scale_factor = dst->op_params[0];
 
-    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
+    upscale_f32_sycl(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_pad(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_pad(const ggml_tensor *src0, const ggml_tensor *src1,
                              ggml_tensor *dst, const float *src0_dd,
                              const float *src1_dd, float *dst_dd,
                              const dpct::queue_ptr &main_stream) {
@@ -8907,7 +8893,7 @@ inline void ggml_cuda_op_pad(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(dst->type == GGML_TYPE_F32);
     GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
 
-    pad_f32_cuda(src0_dd, dst_dd,
+    pad_f32_sycl(src0_dd, dst_dd,
         src0->ne[0], src0->ne[1], src0->ne[2],
         dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
 
@@ -8916,7 +8902,7 @@ inline void ggml_cuda_op_pad(const ggml_tensor *src0, const ggml_tensor *src1,
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_rms_norm(const ggml_tensor *src0,
+inline void ggml_sycl_op_rms_norm(const ggml_tensor *src0,
                                   const ggml_tensor *src1, ggml_tensor *dst,
                                   const float *src0_dd, const float *src1_dd,
                                   float *dst_dd,
@@ -8931,14 +8917,14 @@ inline void ggml_cuda_op_rms_norm(const ggml_tensor *src0,
     float eps;
     memcpy(&eps, dst->op_params, sizeof(float));
 
-    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
+    rms_norm_f32_sycl(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_mul_mat_q(
+inline void ggml_sycl_op_mul_mat_q(
     const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
     const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
     float *dst_dd_i, const int64_t row_low, const int64_t row_high,
@@ -8964,34 +8950,34 @@ inline void ggml_cuda_op_mul_mat_q(
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            ggml_mul_mat_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q4_1:
-            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            ggml_mul_mat_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q5_0:
-            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            ggml_mul_mat_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q5_1:
-            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            ggml_mul_mat_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q8_0:
-            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            ggml_mul_mat_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q2_K:
-            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            ggml_mul_mat_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q3_K:
-            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            ggml_mul_mat_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q4_K:
-            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            ggml_mul_mat_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q5_K:
-            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            ggml_mul_mat_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         case GGML_TYPE_Q6_K:
-            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
+            ggml_mul_mat_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
             break;
         default:
             GGML_ASSERT(false);
@@ -9045,7 +9031,7 @@ static int64_t get_row_rounding(ggml_type type) {
     }
 }
 
-inline void ggml_cuda_op_mul_mat_vec_q(
+inline void ggml_sycl_op_mul_mat_vec_q(
     const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
     const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
     float *dst_dd_i, const int64_t row_low, const int64_t row_high,
@@ -9059,34 +9045,34 @@ inline void ggml_cuda_op_mul_mat_vec_q(
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q4_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q4_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q5_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q5_1_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q8_0_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q2_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q3_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q4_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q5_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
+            mul_mat_vec_q6_K_q8_1_sycl(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
             break;
         default:
             GGML_ASSERT(false);
@@ -9100,7 +9086,7 @@ inline void ggml_cuda_op_mul_mat_vec_q(
     (void) src1_padded_row_size;
 }
 
-inline void ggml_cuda_op_dequantize_mul_mat_vec(
+inline void ggml_sycl_op_dequantize_mul_mat_vec(
     const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
     const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
     float *dst_dd_i, const int64_t row_low, const int64_t row_high,
@@ -9112,7 +9098,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
 
     // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
 #ifdef GGML_SYCL_F16
-    cuda_pool_alloc<sycl::half> src1_dfloat_a;
+    sycl_pool_alloc<sycl::half> src1_dfloat_a;
     sycl::half *src1_dfloat = nullptr; // dfloat == half
 
     bool src1_convert_f16 =
@@ -9122,7 +9108,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
 
     if (src1_convert_f16) {
         src1_dfloat = src1_dfloat_a.alloc(ne00);
-        ggml_cpy_f32_f16_cuda((const char *)src1_ddf_i, (char *)src1_dfloat,
+        ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
                               ne00, ne00, 1, sizeof(float), 0, 0, ne00, 1,
                               sizeof(sycl::half), 0, 0, stream);
     }
@@ -9132,37 +9118,37 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
 
     switch (src0->type) {
         case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q4_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q4_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q5_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q5_1_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q8_0_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q2_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q3_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q4_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q5_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
+            dequantize_mul_mat_vec_q6_K_sycl(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
             break;
         case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
+            convert_mul_mat_vec_f16_sycl(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
             break;
         default:
             GGML_ASSERT(false);
@@ -9176,7 +9162,7 @@ inline void ggml_cuda_op_dequantize_mul_mat_vec(
     (void) src1_padded_row_size;
 }
 
-inline void ggml_cuda_op_mul_mat_cublas(
+inline void ggml_sycl_op_mul_mat_sycl(
     const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
     const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
     float *dst_dd_i, const int64_t row_low, const int64_t row_high,
@@ -9209,68 +9195,68 @@ inline void ggml_cuda_op_mul_mat_cublas(
     if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
 
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        // GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp16 path\n");
-        cuda_pool_alloc<sycl::half> src0_as_f16;
+        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");
+        sycl_pool_alloc<sycl::half> src0_as_f16;
         if (src0->type != GGML_TYPE_F16) {
-            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
-            GGML_ASSERT(to_fp16_cuda != nullptr);
+            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src0->type);
+            GGML_ASSERT(to_fp16_sycl != nullptr);
             size_t ne = row_diff*ne00;
             src0_as_f16.alloc(ne);
-            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
+            to_fp16_sycl(src0_dd_i, src0_as_f16.get(), ne, stream);
         }
         const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16
                                          ? (const sycl::half *)src0_dd_i
                                          : src0_as_f16.get();
 
-        cuda_pool_alloc<sycl::half> src1_as_f16;
+        sycl_pool_alloc<sycl::half> src1_as_f16;
         if (src1->type != GGML_TYPE_F16) {
-            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-            GGML_ASSERT(to_fp16_cuda != nullptr);
+            const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+            GGML_ASSERT(to_fp16_sycl != nullptr);
             size_t ne = src1_ncols*ne10;
             src1_as_f16.alloc(ne);
-            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
+            to_fp16_sycl(src1_ddf_i, src1_as_f16.get(), ne, stream);
         }
         const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
                                          ? (const sycl::half *)src1_ddf_i
                                          : src1_as_f16.get();
-        cuda_pool_alloc<sycl::half> dst_f16(row_diff * src1_ncols);
+        sycl_pool_alloc<sycl::half> dst_f16(row_diff * src1_ncols);
 
         const sycl::half alpha_f16 = 1.0f;
         const sycl::half beta_f16 = 0.0f;
 
-        SYCL_CHECK(CHECK_TRY_ERROR(g_cublas_handles[id] = stream));
+        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[id] = stream));
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm(
-            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
+            *g_sycl_handles[id], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
             &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
             src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
             dst_f16.get(), dpct::library_data_t::real_half, ldc,
             dpct::library_data_t::real_half)));
 
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+        to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
     }
     else {
-        // GGML_SYCL_DEBUG("ggml_cuda_op_mul_mat_cublas - fp32 path\n");
-        cuda_pool_alloc<float> src0_ddq_as_f32;
+        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n");
+        sycl_pool_alloc<float> src0_ddq_as_f32;
 
         if (src0->type != GGML_TYPE_F32) {
-            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-            GGML_ASSERT(to_fp32_cuda != nullptr);
+            const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type);
+            GGML_ASSERT(to_fp32_sycl != nullptr);
             src0_ddq_as_f32.alloc(row_diff*ne00);
-            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
+            to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
         }
         const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
 
         const float alpha = 1.0f;
         const float beta = 0.0f;
 
-        SYCL_CHECK(CHECK_TRY_ERROR(g_cublas_handles[id] = stream));
+        SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[id] = stream));
         SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm(
-            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
+            *g_sycl_handles[id], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
-            dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00,
-            src1_ddf_i, ne10, dpct::get_value(&beta, *g_cublas_handles[id]),
+            dpct::get_value(&alpha, *g_sycl_handles[id]), src0_ddf_i, ne00,
+            src1_ddf_i, ne10, dpct::get_value(&beta, *g_sycl_handles[id]),
             dst_dd_i, ldc)));
     }
 
@@ -9284,7 +9270,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-inline void ggml_cuda_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
                               ggml_tensor *dst, const float *src0_dd,
                               const float *src1_dd, float *dst_dd,
                               const dpct::queue_ptr &main_stream) {
@@ -9329,15 +9315,15 @@ inline void ggml_cuda_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     // compute
     if (is_glm) {
         GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
+        rope_glm_f32_sycl(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
     } else if (is_neox) {
         if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda(
+            rope_neox_sycl(
                 (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
+            rope_neox_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
                            ne00, n_dims, nrows, pos, freq_scale, ne01,
                            freq_base, ext_factor, attn_factor, corr_dims,
                            main_stream);
@@ -9346,12 +9332,12 @@ inline void ggml_cuda_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
         }
     } else {
         if (src0->type == GGML_TYPE_F32) {
-            rope_cuda(
+            rope_sycl(
                 (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                 attn_factor, corr_dims, main_stream
             );
         } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
+            rope_sycl((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
                       nrows, pos, freq_scale, ne01, freq_base, ext_factor,
                       attn_factor, corr_dims, main_stream);
         } else {
@@ -9364,7 +9350,7 @@ inline void ggml_cuda_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
                                ggml_tensor *dst, const float *src0_dd,
                                const float *src1_dd, float *dst_dd,
                                const dpct::queue_ptr &main_stream) {
@@ -9390,13 +9376,13 @@ inline void ggml_cuda_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
     const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
     const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
 
-    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
+    alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
 
     (void) src1;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_im2col(const ggml_tensor *src0,
+inline void ggml_sycl_op_im2col(const ggml_tensor *src0,
                                 const ggml_tensor *src1, ggml_tensor *dst,
                                 const float *src0_dd, const float *src1_dd,
                                 float *dst_dd,
@@ -9427,14 +9413,14 @@ inline void ggml_cuda_op_im2col(const ggml_tensor *src0,
 
     const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
 
-    im2col_f32_f16_cuda(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH,
+    im2col_f32_f16_sycl(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH,
                         IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
 
     (void) src0;
     (void) src0_dd;
 }
 
-inline void ggml_cuda_op_sum_rows(const ggml_tensor *src0,
+inline void ggml_sycl_op_sum_rows(const ggml_tensor *src0,
                                   const ggml_tensor *src1, ggml_tensor *dst,
                                   const float *src0_dd, const float *src1_dd,
                                   float *dst_dd,
@@ -9446,14 +9432,14 @@ inline void ggml_cuda_op_sum_rows(const ggml_tensor *src0,
     const int64_t ncols = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);
 
-    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
+    sum_rows_f32_sycl(src0_dd, dst_dd, ncols, nrows, main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_argsort(const ggml_tensor *src0,
+inline void ggml_sycl_op_argsort(const ggml_tensor *src0,
                                  const ggml_tensor *src1, ggml_tensor *dst,
                                  const float *src0_dd, const float *src1_dd,
                                  float *dst_dd,
@@ -9467,14 +9453,14 @@ inline void ggml_cuda_op_argsort(const ggml_tensor *src0,
 
     enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
 
-    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
+    argsort_f32_i32_sycl(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_diag_mask_inf(const ggml_tensor *src0,
+inline void ggml_sycl_op_diag_mask_inf(const ggml_tensor *src0,
                                        const ggml_tensor *src1,
                                        ggml_tensor *dst, const float *src0_dd,
                                        const float *src1_dd, float *dst_dd,
@@ -9489,14 +9475,14 @@ inline void ggml_cuda_op_diag_mask_inf(const ggml_tensor *src0,
 
     const int n_past = ((int32_t *) dst->op_params)[0];
 
-    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
+    diag_mask_inf_f32_sycl(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
 
     (void) src1;
     (void) dst;
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_soft_max(const ggml_tensor *src0,
+inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
                                   const ggml_tensor *src1, ggml_tensor *dst,
                                   const float *src0_dd, const float *src1_dd,
                                   float *dst_dd,
@@ -9514,12 +9500,12 @@ inline void ggml_cuda_op_soft_max(const ggml_tensor *src0,
     float scale = 1.0f;
     memcpy(&scale, dst->op_params, sizeof(float));
 
-    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
+    soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
 
     (void) dst;
 }
 
-inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
                                ggml_tensor *dst, const float *src0_dd,
                                const float *src1_dd, float *dst_dd,
                                const dpct::queue_ptr &main_stream) {
@@ -9530,7 +9516,7 @@ inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
     float scale;
     memcpy(&scale, dst->op_params, sizeof(float));
 
-    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
+    scale_f32_sycl(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
     /*
     DPCT1010:87: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
@@ -9542,7 +9528,7 @@ inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
     (void) src1_dd;
 }
 
-inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
+inline void ggml_sycl_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
                                ggml_tensor *dst, const float *src0_dd,
                                const float *src1_dd, float *dst_dd,
                                const dpct::queue_ptr &main_stream) {
@@ -9555,7 +9541,7 @@ inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
     memcpy(&min, dst->op_params, sizeof(float));
     memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
 
-    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
+    clamp_f32_sycl(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
     /*
     DPCT1010:88: SYCL uses exceptions to report errors and does not use the
     error codes. The call was replaced with 0. You need to rewrite this code.
@@ -9567,9 +9553,9 @@ inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
     (void) src1_dd;
 }
 
-static void ggml_cuda_op_flatten(const ggml_tensor *src0,
+static void ggml_sycl_op_flatten(const ggml_tensor *src0,
                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                 const ggml_cuda_op_flatten_t op) try {
+                                 const ggml_sycl_op_flatten_t op) try {
     const int64_t nrows0 = ggml_nrows(src0);
 
     const bool use_src1 = src1 != nullptr;
@@ -9591,12 +9577,12 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
     float * src1_ddf = nullptr;
     float *  dst_ddf = nullptr;
 
-    cuda_pool_alloc<float> src0_f;
-    cuda_pool_alloc<float> src1_f;
-    cuda_pool_alloc<float>  dst_f;
+    sycl_pool_alloc<float> src0_f;
+    sycl_pool_alloc<float> src1_f;
+    sycl_pool_alloc<float>  dst_f;
 
-    ggml_cuda_set_device(g_main_device);
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device_index][0];
+    ggml_sycl_set_device(g_main_device);
+    dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
     GGML_SYCL_DEBUG("g_main_device_index=%d, src0=%p\n", g_main_device_index, src0);
 
     if (src0_on_device) {
@@ -9605,8 +9591,8 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
         src0_ddf = src0_f.alloc(ggml_nelements(src0));
         GGML_SYCL_DEBUG("g_main_device_index=%d, src0_ddf=%p\n", g_main_device_index, src0_ddf);
 
-        GGML_SYCL_DEBUG("before ggml_cuda_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0);
-        SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
+        GGML_SYCL_DEBUG("before ggml_sycl_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0);
+        SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
     if (use_src1) {
@@ -9614,7 +9600,7 @@ static void ggml_cuda_op_flatten(const ggml_tensor *src0,
             src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
         } else {
             src1_ddf = src1_f.alloc(ggml_nelements(src1));
-            SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
+            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
         }
     }
     if (dst_on_device) {
@@ -9651,10 +9637,10 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_cuda_set_peer_access(const int n_tokens) {
+static void ggml_sycl_set_peer_access(const int n_tokens) {
     static bool peer_access_enabled = false;
 
-    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
+    const bool enable_peer_access = n_tokens <= GGML_SYCL_PEER_MAX_BATCH_SIZE;
 
     if (peer_access_enabled == enable_peer_access) {
         return;
@@ -9662,12 +9648,12 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
 
 #ifdef NDEBUG
     for (int id = 0; id < g_device_count; ++id) {
-        SYCL_CHECK(ggml_cuda_set_device(g_device_caps[id].device_id));
-        // SYCL_CHECK(cudaDeviceSynchronize());
+        SYCL_CHECK(ggml_sycl_set_device(g_device_caps[id].device_id));
+        // SYCL_CHECK(syclDeviceSynchronize());
     }
 
     for (int id = 0; id < g_device_count; ++id) {
-        SYCL_CHECK(ggml_cuda_set_device(g_device_caps[id].device_id));
+        SYCL_CHECK(ggml_sycl_set_device(g_device_caps[id].device_id));
         int device_id = g_device_caps[id].device_id;
 
         for (int id_other = 0; id_other < g_device_count; ++id_other) {
@@ -9680,12 +9666,12 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
             }
 
             int can_access_peer;
-            // SYCL_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
+            // SYCL_CHECK(syclDeviceCanAccessPeer(&can_access_peer, id, id_other));
             // if (can_access_peer) {
             //     if (enable_peer_access) {
-            //         SYCL_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
+            //         SYCL_CHECK(syclDeviceEnablePeerAccess(id_other, 0));
             //     } else {
-            //         SYCL_CHECK(cudaDeviceDisablePeerAccess(id_other));
+            //         SYCL_CHECK(syclDeviceDisablePeerAccess(id_other));
             //     }
             // }
         }
@@ -9695,9 +9681,9 @@ static void ggml_cuda_set_peer_access(const int n_tokens) {
     peer_access_enabled = enable_peer_access;
 }
 
-static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
+static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                 ggml_cuda_op_mul_mat_t op,
+                                 ggml_sycl_op_mul_mat_t op,
                                  const bool convert_src1_to_q8_1) try {
 
     const int64_t ne00 = src0->ne[0];
@@ -9748,19 +9734,19 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
     GGML_ASSERT(!(split && ne02 < ne12));
 
     // dd = data device
-    char  *  src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
-    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
-    char  * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
-    float *   dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
+    char  *  src0_dd[GGML_SYCL_MAX_DEVICES] = {nullptr};
+    float * src1_ddf[GGML_SYCL_MAX_DEVICES] = {nullptr}; // float
+    char  * src1_ddq[GGML_SYCL_MAX_DEVICES] = {nullptr}; // q8_1
+    float *   dst_dd[GGML_SYCL_MAX_DEVICES] = {nullptr};
 
     // as = actual size
-    size_t  src0_as[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t   dst_as[GGML_CUDA_MAX_DEVICES] = {0};
+    size_t  src0_as[GGML_SYCL_MAX_DEVICES] = {0};
+    size_t src1_asf[GGML_SYCL_MAX_DEVICES] = {0};
+    size_t src1_asq[GGML_SYCL_MAX_DEVICES] = {0};
+    size_t   dst_as[GGML_SYCL_MAX_DEVICES] = {0};
 
-    int64_t  row_low[GGML_CUDA_MAX_DEVICES];
-    int64_t row_high[GGML_CUDA_MAX_DEVICES];
+    int64_t  row_low[GGML_SYCL_MAX_DEVICES];
+    int64_t row_high[GGML_SYCL_MAX_DEVICES];
 
     int used_devices = 0;
 
@@ -9800,27 +9786,27 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
         const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
 
-        ggml_cuda_set_device(id);
-        const dpct::queue_ptr stream = g_cudaStreams[id][0];
+        ggml_sycl_set_device(id);
+        const dpct::queue_ptr stream = g_syclStreams[id][0];
 
         if (src0_on_device && src0_is_contiguous) {
             src0_dd[id] = (char *) src0_extra->data_device[id];
         } else {
             // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
-            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
+            src0_dd[id] = (char *) ggml_sycl_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
         }
 
         if (src1_on_device && src1_is_contiguous) {
             src1_ddf[id] = (float *) src1_extra->data_device[id];
         } else {
-            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
+            src1_ddf[id] = (float *) ggml_sycl_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
         }
 
         if (convert_src1_to_q8_1) {
-            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
+            src1_ddq[id] = (char *) ggml_sycl_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
 
             if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
+                quantize_row_q8_1_sycl(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
                 /*
                 DPCT1010:90: SYCL uses exceptions to report errors and does not
                 use the error codes. The call was replaced with 0. You need to
@@ -9834,14 +9820,14 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
             dst_dd[id] = (float *) dst_extra->data_device[id];
         } else {
             const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
-            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
+            dst_dd[id] = (float *) ggml_sycl_pool_malloc(size_dst_ddf, &dst_as[id]);
         }
     }
 
     // if multiple devices are used they need to wait for the main device
     // here an event is recorded that signals that the main device has finished calculating the input data
     if (split && used_devices > 1) {
-        SYCL_CHECK(ggml_cuda_set_device(g_main_device));
+        SYCL_CHECK(ggml_sycl_set_device(g_main_device));
         /*
         DPCT1024:91: The original code returned the error code that was further
         consumed by the program logic. This original code was replaced with 0.
@@ -9849,7 +9835,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
         */
         SYCL_CHECK(CHECK_TRY_ERROR(
             *src0_extra->events[g_main_device_index][0] =
-                g_cudaStreams[g_main_device_index][0]->ext_oneapi_submit_barrier()));
+                g_syclStreams[g_main_device_index][0]->ext_oneapi_submit_barrier()));
     }
 
     const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
@@ -9866,8 +9852,8 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
             const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
             const int64_t row_diff = row_high[id] - row_low[id];
 
-            ggml_cuda_set_device(id);
-            const dpct::queue_ptr stream = g_cudaStreams[id][is];
+            ggml_sycl_set_device(id);
+            const dpct::queue_ptr stream = g_syclStreams[id][is];
 
             // wait for main GPU data if necessary
             if (split && (id != g_main_device_index || is != 0)) {
@@ -9911,14 +9897,14 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                         }
                     }
                 } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
-                    SYCL_CHECK(ggml_cuda_cpy_tensor_2d(
+                    SYCL_CHECK(ggml_sycl_cpy_tensor_2d(
                                    src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
                 } else {
                     GGML_ASSERT(false);
                 }
 
                 if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
-                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
+                    quantize_row_q8_1_sycl(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
                     /*
                     DPCT1010:92: SYCL uses exceptions to report errors and does
                     not use the error codes. The call was replaced with 0. You
@@ -9928,7 +9914,7 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
                 }
 
                 if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
-                    SYCL_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
+                    SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
                 }
 
                 // do the computation
@@ -9997,20 +9983,20 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
         if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) {
             continue;
         }
-        SYCL_CHECK(ggml_cuda_set_device(id));
+        SYCL_CHECK(ggml_sycl_set_device(id));
 
         // free buffers again when done
         if (dst_as[id] > 0) {
-            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
+            ggml_sycl_pool_free(dst_dd[id], dst_as[id]);
         }
         if (src1_asq[id] > 0) {
-            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
+            ggml_sycl_pool_free(src1_ddq[id], src1_asq[id]);
         }
         if (src1_asf[id] > 0) {
-            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
+            ggml_sycl_pool_free(src1_ddf[id], src1_asf[id]);
         }
         if (src0_as[id] > 0) {
-            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
+            ggml_sycl_pool_free(src0_dd[id], src0_as[id]);
         }
     }
 
@@ -10019,21 +10005,21 @@ static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
         int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
         is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
 
-        SYCL_CHECK(ggml_cuda_set_device(g_main_device));
+        SYCL_CHECK(ggml_sycl_set_device(g_main_device));
         for (int64_t id = 0; id < g_device_count; ++id) {
             if (row_low[id] == row_high[id]) {
                 continue;
             }
             for (int64_t is = 0; is < is_max; ++is) {
                 SYCL_CHECK(CHECK_TRY_ERROR(
-                    g_cudaStreams[g_main_device_index][0]->ext_oneapi_submit_barrier(
+                    g_syclStreams[g_main_device_index][0]->ext_oneapi_submit_barrier(
                         {*src0_extra->events[id][is]})));
             }
         }
     }
 
     if (dst->backend == GGML_BACKEND_CPU) {
-        SYCL_CHECK(ggml_cuda_set_device(g_main_device));
+        SYCL_CHECK(ggml_sycl_set_device(g_main_device));
         SYCL_CHECK(CHECK_TRY_ERROR(
             dpct::get_current_device().queues_wait_and_throw()));
     }
@@ -10044,103 +10030,103 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_repeat);
 }
 
-static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_get_rows);
 }
 
-static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_add);
 }
 
-static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_acc);
 }
 
-static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_mul);
 }
 
-static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_div);
 }
 
-static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_gelu);
 }
 
-static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_silu);
 }
 
-static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_gelu_quick);
 }
 
-static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_tanh);
 }
 
-static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_relu);
 }
 
-static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_leaky_relu);
 }
 
-static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_sqr);
 }
 
-static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_norm);
 }
 
-static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_group_norm);
 }
 
-static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_concat);
 }
 
-static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_upscale);
 }
 
-static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pad);
 }
 
-static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rms_norm);
 }
 
-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
+bool ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    if (!g_sycl_loaded) return false;
 
     const int64_t ne10 = src1->ne[0];
 
@@ -10154,7 +10140,7 @@ bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_te
             (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
 }
 
-static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0,
+static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0,
                                        const ggml_tensor *src1,
                                        ggml_tensor *dst) try {
     GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
@@ -10170,8 +10156,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0,
 
     const int64_t ne12 = src1->ne[2];
 
-    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device_index][0];
+    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device_index];
@@ -10182,7 +10168,7 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0,
     ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
 
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
+    ggml_mul_mat_p021_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -10190,7 +10176,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0,
+static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0,
                                      const ggml_tensor *src1,
                                      ggml_tensor *dst) try {
     GGML_ASSERT(!ggml_is_transposed(src0));
@@ -10209,8 +10195,8 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0,
 
     const int64_t ne12 = src1->ne[2];
 
-    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device_index][0];
+    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device_index];
@@ -10224,7 +10210,7 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0,
     const int64_t row_stride_x = nb01 / sizeof(sycl::half);
     const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
 
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
+    ggml_mul_mat_vec_nc_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -10257,7 +10243,7 @@ static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
     ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2   + i13*nbd3;
 }
 
-static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
+static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
                                                  const ggml_tensor *src1,
                                                  ggml_tensor *dst) try {
     GGML_ASSERT(!ggml_is_transposed(src0));
@@ -10288,11 +10274,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne  = ggml_nelements(dst);
 
-    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device_index][0];
+    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
 
     SYCL_CHECK(
-        CHECK_TRY_ERROR(g_cublas_handles[g_main_device_index] = main_stream));
+        CHECK_TRY_ERROR(g_sycl_handles[g_main_device_index] = main_stream));
 
     ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     void * src0_ddq = src0_extra->data_device[g_main_device_index];
@@ -10305,13 +10291,13 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
 
     // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
+    const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+    GGML_ASSERT(to_fp16_sycl != nullptr);
 
-    cuda_pool_alloc<sycl::half> src1_as_f16(ne1);
-    to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
+    sycl_pool_alloc<sycl::half> src1_as_f16(ne1);
+    to_fp16_sycl(src1_ddf, src1_as_f16.get(), ne1, main_stream);
 
-    cuda_pool_alloc<sycl::half> dst_f16;
+    sycl_pool_alloc<sycl::half> dst_f16;
     char * dst_t;
 
     dpct::library_data_t cu_compute_type = dpct::library_data_t::real_half;
@@ -10353,7 +10339,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
     const int64_t r3 = ne13/ne03;
 
 #if 0
-    // use cublasGemmEx
+    // use syclGemmEx
     {
         for (int i13 = 0; i13 < ne13; ++i13) {
             for (int i12 = 0; i12 < ne12; ++i12) {
@@ -10361,10 +10347,10 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
                 int i02 = i12 / r2;
 
                 SYCL_CHECK(
-                        cublasGemmEx(g_cublas_handles[g_main_device_index], CUBLAS_OP_T, CUBLAS_OP_N,
+                        syclGemmEx(g_sycl_handles[g_main_device_index], CUBLAS_OP_T, CUBLAS_OP_N,
                             ne01, ne11, ne10,
-                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
-                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
+                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , SYCL_R_16F,   nb01/sizeof(half),
+                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F,   nb11/sizeof(float),
                             beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
                             cu_compute_type,
                             CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@@ -10374,9 +10360,9 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
 #else
     if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
-        // use cublasGemmStridedBatchedEx
+        // use syclGemmStridedBatchedEx
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
-            *g_cublas_handles[g_main_device_index], oneapi::mkl::transpose::trans,
+            *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const char *)src0_as_f16, dpct::library_data_t::real_half,
             nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
@@ -10385,11 +10371,11 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
             (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float),
             ne12 * ne13, cu_compute_type)));
     } else {
-        // use cublasGemmBatchedEx
+        // use syclGemmBatchedEx
         const int ne23 = ne12*ne13;
 
-        cuda_pool_alloc<const void *> ptrs_src(2*ne23);
-        cuda_pool_alloc<      void *> ptrs_dst(1*ne23);
+        sycl_pool_alloc<const void *> ptrs_src(2*ne23);
+        sycl_pool_alloc<      void *> ptrs_dst(1*ne23);
 
         sycl::range<3> block_dims(1, ne12, ne13);
         /*
@@ -10425,7 +10411,7 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
         SYCL_CHECK(0);
 
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
-            *g_cublas_handles[g_main_device_index], oneapi::mkl::transpose::trans,
+            *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans,
             oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
             (const void **)(ptrs_src.get() + 0 * ne23),
             dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),
@@ -10437,8 +10423,8 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
 #endif
 
     if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
+        const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+        to_fp32_sycl(dst_f16.get(), dst_ddf, ne, main_stream);
     }
 }
 catch (sycl::exception const &exc) {
@@ -10447,7 +10433,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     const bool all_on_device =
         (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
         (src1->backend == GGML_BACKEND_GPU) &&
@@ -10462,10 +10448,10 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
         }
     }
 
-#ifdef CUDA_USE_TENSOR_CORES
-    const bool use_tensor_cores = true;
+#ifdef SYCL_USE_XMX
+    const bool use_xmx = true;
 #else
-    const bool use_tensor_cores = false;
+    const bool use_xmx = false;
 #endif
 
     // debug helpers
@@ -10476,48 +10462,51 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
     //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
     //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
 
-    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+    if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
-        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
+        GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_p021\n");
+        ggml_sycl_mul_mat_vec_p021(src0, src1, dst);
+    } else if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
-        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+        GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
+        ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
+    } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
-        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
+        GGML_SYCL_DEBUG("ggml_sycl_mul_mat_mat_batched_sycl\n");
+        ggml_sycl_mul_mat_mat_batched_sycl(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
-        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+        GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
+        ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-#ifdef GGML_CUDA_FORCE_DMMV
+        GGML_SYCL_DEBUG("ggml_is_quantized or GGML_TYPE_F16\n");
+        if (src1->ne[1] == 1 && src0->ne[0] % GGML_SYCL_DMMV_X == 0) {
+#ifdef GGML_SYCL_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
             const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
-#endif // GGML_CUDA_FORCE_DMMV
+#endif // GGML_SYCL_FORCE_DMMV
 
             if (use_mul_mat_vec_q) {
                 // NOTE: this kernel does not support ggml_nrows(src1) > 1
-                // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_vec_q path\n");
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
+                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_vec_q path\n");
+                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_vec_q, true);
             } else {
-                // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_dequantize_mul_mat_vec path\n");
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
+                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_dequantize_mul_mat_vec path\n");
+                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
             }
         } else {
             bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
 
-            // when tensor cores are available, use them for large batch size
-            // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-            if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
+            if (use_xmx && min_compute_capability >= CC_VOLTA && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
                 use_mul_mat_q = false;
             }
 
             if (use_mul_mat_q) {
-                // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_q path\n");
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
+                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_q path\n");
+                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_q, true);
             } else {
-                // GGML_SYCL_DEBUG("ggml_cuda_mul_mat ggml_cuda_op_mul_mat_cublas path\n");
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
+                // GGML_SYCL_DEBUG("ggml_sycl_mul_mat ggml_sycl_op_mul_mat_sycl path\n");
+                ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
             }
         }
     } else {
@@ -10549,8 +10538,8 @@ static __global__ void k_compute_batched_ptrs_id(
     } else {
         src0_f16 = src0_as_f16;
         if (threadIdx.x == 0 && threadIdx.y == 0) {
-            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
-            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
+            const to_fp16_sycl_t to_fp16 = ggml_get_to_fp16_sycl(src0_type);
+            to_fp16(srcs_ar[i], src0_f16, src0_ne, syclStreamFireAndForget);
         }
     }
 
@@ -10569,7 +10558,7 @@ static __global__ void k_compute_batched_ptrs_id(
     ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
 }
 
-static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
+static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
     const struct ggml_tensor * ids = dst->src[0];
     const struct ggml_tensor * src1 = dst->src[1];
     const struct ggml_tensor * src00 = dst->src[2];
@@ -10603,10 +10592,10 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne  = ggml_nelements(dst);
 
-    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-    cudaStream_t main_stream = g_cudaStreams[g_main_device_index][0];
+    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
+    syclStream_t main_stream = g_syclStreams[g_main_device_index][0];
 
-    SYCL_CHECK(cublasSetStream(g_cublas_handles[g_main_device_index], main_stream));
+    SYCL_CHECK(syclSetStream(g_sycl_handles[g_main_device_index], main_stream));
 
     //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     //void * src0_ddq = src0_extra->data_device[g_main_device_index];
@@ -10619,15 +10608,15 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
     float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
 
     // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
+    const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type);
+    GGML_ASSERT(to_fp16_sycl != nullptr);
 
     size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
-    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
+    half * src1_as_f16 = (half *) ggml_sycl_pool_malloc(ne1 * sizeof(half), &src1_as);
+    to_fp16_sycl(src1_ddf, src1_as_f16, ne1, main_stream);
 
     size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
+    half * dst_f16 = (half *) ggml_sycl_pool_malloc(ne * sizeof(half), &dst_as);
 
     GGML_ASSERT(ne12 % ne02 == 0);
     GGML_ASSERT(ne13 % ne03 == 0);
@@ -10639,7 +10628,7 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
     const half alpha_f16 = 1.0f;
     const half beta_f16  = 0.0f;
 
-    // use cublasGemmBatchedEx
+    // use syclGemmBatchedEx
     const int ne23 = ne12*ne13;
 
     const void ** ptrs_src = nullptr;
@@ -10648,14 +10637,14 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
     size_t ptrs_src_s = 0;
     size_t ptrs_dst_s = 0;
 
-    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
+    ptrs_src = (const void **) ggml_sycl_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
+    ptrs_dst = (      void **) ggml_sycl_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
 
     int64_t src0_ne = ggml_nelements(src00);
     half * src0_as_f16 = nullptr;
     size_t src0_as = 0;
     if (src00->type != GGML_TYPE_F16) {
-        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
+        src0_as_f16 = (half *) ggml_sycl_pool_malloc(src0_ne * sizeof(half), &src0_as);
     }
 
     static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
@@ -10676,41 +10665,41 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
             dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device_index] : nullptr,
             dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device_index] : nullptr
     );
-    SYCL_CHECK(cudaGetLastError());
+    SYCL_CHECK(syclGetLastError());
 
     SYCL_CHECK(
-    cublasGemmBatchedEx(g_cublas_handles[g_main_device_index], CUBLAS_OP_T, CUBLAS_OP_N,
+    syclGemmBatchedEx(g_sycl_handles[g_main_device_index], CUBLAS_OP_T, CUBLAS_OP_N,
             ne01, ne11, ne10,
-            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
-                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
-            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
+            &alpha_f16, (const void **) (ptrs_src + 0*ne23), SYCL_R_16F, ne00,
+                        (const void **) (ptrs_src + 1*ne23), SYCL_R_16F, ne10,
+            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), SYCL_R_16F, ne01,
             ne23,
             CUBLAS_COMPUTE_16F,
             CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 
     if (src0_as != 0) {
-        ggml_cuda_pool_free(src0_as_f16, src0_as);
+        ggml_sycl_pool_free(src0_as_f16, src0_as);
     }
     if (ptrs_src_s != 0) {
-        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
+        ggml_sycl_pool_free(ptrs_src, ptrs_src_s);
     }
     if (ptrs_dst_s != 0) {
-        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
+        ggml_sycl_pool_free(ptrs_dst, ptrs_dst_s);
     }
 
-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
+    const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16);
+    to_fp32_sycl(dst_f16, dst_ddf, ne, main_stream);
 
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
+    ggml_sycl_pool_free(src1_as_f16, src1_as);
+    ggml_sycl_pool_free(dst_f16, dst_as);
 }
 #endif
 
-static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
+static void ggml_sycl_mul_mat_id(const ggml_tensor *src0,
                                  const ggml_tensor *src1,
                                  ggml_tensor *dst) try {
 #if 0
-    ggml_cuda_mul_mat_id_cublas(dst);
+    ggml_sycl_mul_mat_id_sycl(dst);
     // TODO: mmq/mmv support
 #endif
 
@@ -10723,7 +10712,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
 
     std::vector<char> ids_host(ggml_nbytes(ids));
 
-    const dpct::queue_ptr stream = g_cudaStreams[g_main_device_index][0];
+    const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
 
     if (ids->backend == GGML_BACKEND_GPU) {
         const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index];
@@ -10760,8 +10749,8 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
 
         for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
             //int32_t row_id;
-            //SYCL_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-            //SYCL_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+            //SYCL_CHECK(syclMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), syclMemcpyDeviceToHost, g_syclStreams[g_main_device][0]));
+            //SYCL_CHECK(syclStreamSynchronize(g_syclStreams[g_main_device][0]));
 
             const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
 
@@ -10775,11 +10764,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
             dst_row_extra.data_device[g_main_device_index] = dst_original + i01*dst->nb[1];
             dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
 
-            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
         }
     } else {
-        cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
-        cuda_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));
+        sycl_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
+        sycl_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));
 
         src1_row_extra.data_device[g_main_device_index] = src1_contiguous.get();
         dst_row_extra.data_device[g_main_device_index]  =  dst_contiguous.get();
@@ -10825,7 +10814,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
             dst_row.nb[2] = num_src1_rows*nb1;
             dst_row.nb[3] = num_src1_rows*nb1;
 
-            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
+            ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row);
 
             num_src1_rows = 0;
             for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
@@ -10855,15 +10844,15 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
+static void ggml_sycl_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_scale);
 }
 
-static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
+static void ggml_sycl_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_clamp);
 }
 
-static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
+static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
                           ggml_tensor *dst) try {
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));
@@ -10890,8 +10879,8 @@ static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
     const int64_t nb11 = src1->nb[1];
     const int64_t nb12 = src1->nb[2];
 
-    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device_index][0];
+    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
+    dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
 
     const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
     const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
@@ -10900,17 +10889,17 @@ static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
     char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index];
 
     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_q8_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+        ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -10925,44 +10914,44 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     // TODO: why do we pass dst as src1 here?
-    ggml_cuda_cpy(src0, dst, nullptr);
+    ggml_sycl_cpy(src0, dst, nullptr);
     (void) src1;
 }
 
-static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
+static void ggml_sycl_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_diag_mask_inf);
 }
 
-static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
+static void ggml_sycl_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_soft_max);
 }
 
-static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
 }
 
-static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
+static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
 }
 
-static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
+static void ggml_sycl_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_im2col);
 }
 
-static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_sum_rows);
 }
 
-static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
+    ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_argsort);
 }
 
-static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+static void ggml_sycl_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     (void) src0;
     (void) src1;
     (void) dst;
@@ -10974,7 +10963,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl
     return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
 }
 
-void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
+void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
     const int64_t nrows = ggml_nrows(tensor);
 
     const int64_t ne0 = tensor->ne[0];
@@ -10990,7 +10979,7 @@ void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
             continue;
         }
 
-        ggml_cuda_set_device(id);
+        ggml_sycl_set_device(id);
 
         int64_t row_low, row_high;
         if (backend == GGML_BACKEND_GPU) {
@@ -11061,7 +11050,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-void ggml_cuda_free_data(struct ggml_tensor *tensor) try {
+void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
     if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
         return;
     }
@@ -11070,14 +11059,14 @@ void ggml_cuda_free_data(struct ggml_tensor *tensor) try {
 
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
-            SYCL_CHECK(ggml_cuda_set_device(id));
+            SYCL_CHECK(ggml_sycl_set_device(id));
             SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
                 extra->data_device[id], dpct::get_in_order_queue())));
         }
 
         for (int64_t is = 0; is < MAX_STREAMS; ++is) {
             if (extra->events[id][is] != nullptr) {
-                SYCL_CHECK(ggml_cuda_set_device(id));
+                SYCL_CHECK(ggml_sycl_set_device(id));
                 SYCL_CHECK(CHECK_TRY_ERROR(
                     dpct::destroy_event(extra->events[id][is])));
             }
@@ -11095,20 +11084,20 @@ catch (sycl::exception const &exc) {
 static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
 static size_t g_temp_tensor_extra_index = 0;
 
-static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+static ggml_tensor_extra_gpu * ggml_sycl_alloc_temp_tensor_extra() {
     if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
+        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_SYCL_MAX_NODES];
     }
 
     size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
+    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_SYCL_MAX_NODES;
     ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
     memset(extra, 0, sizeof(*extra));
 
     return extra;
 }
 
-static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
+static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
                                           bool scratch, bool force_inplace,
                                           bool no_alloc) try {
     if (scratch && g_scratch_size == 0) {
@@ -11117,15 +11106,14 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
 
     tensor->backend = GGML_BACKEND_GPU;
 
-    // recursively assign CUDA buffers until a compute tensor is found
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
         const ggml_op src0_op = tensor->src[0]->op;
         if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
+            ggml_sycl_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
         }
     }
     if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
+        ggml_sycl_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
     }
 
     if (scratch && no_alloc) {
@@ -11139,7 +11127,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
         force_inplace;
     const size_t size = ggml_nbytes(tensor);
 
-    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
+    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
@@ -11147,12 +11135,12 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
         if (tensor->op == GGML_OP_VIEW) {
             memcpy(&offset, tensor->op_params, sizeof(size_t));
         }
-        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra = ggml_sycl_alloc_temp_tensor_extra();
         extra->data_device[g_main_device_index] = src0_ddc + offset;
     } else if (tensor->op == GGML_OP_CPY) {
         ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
         void * src1_ddv = src1_extra->data_device[g_main_device_index];
-        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra = ggml_sycl_alloc_temp_tensor_extra();
         extra->data_device[g_main_device_index] = src1_ddv;
     } else if (scratch) {
         GGML_ASSERT(size <= g_scratch_size);
@@ -11167,7 +11155,7 @@ static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
                     g_scratch_size, dpct::get_in_order_queue())));
             g_scratch_buffer = data;
         }
-        extra = ggml_cuda_alloc_temp_tensor_extra();
+        extra = ggml_sycl_alloc_temp_tensor_extra();
         extra->data_device[g_main_device_index] = data + g_scratch_offset;
 
         g_scratch_offset += size;
@@ -11192,19 +11180,19 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor,
+void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
                                      size_t offset) try {
     if (g_scratch_size == 0) {
         return;
     }
     if (g_scratch_buffer == nullptr) {
-        ggml_cuda_set_device(g_main_device);
+        ggml_sycl_set_device(g_main_device);
         SYCL_CHECK(
             CHECK_TRY_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
                                  g_scratch_size, dpct::get_in_order_queue())));
     }
 
-    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+    ggml_tensor_extra_gpu * extra = ggml_sycl_alloc_temp_tensor_extra();
 
     const bool inplace = tensor->view_src != nullptr;
 
@@ -11228,12 +11216,12 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-void ggml_cuda_copy_to_device(struct ggml_tensor *tensor) try {
+void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
     GGML_ASSERT(ggml_is_contiguous(tensor));
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-    SYCL_CHECK(ggml_cuda_set_device(g_main_device));
+    SYCL_CHECK(ggml_sycl_set_device(g_main_device));
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_in_order_queue()
                                     .memcpy(extra->data_device[g_main_device_index],
                                             tensor->data, ggml_nbytes(tensor))
@@ -11245,23 +11233,23 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
+void ggml_sycl_assign_buffers(struct ggml_tensor * tensor) {
+    ggml_sycl_assign_buffers_impl(tensor, true, false, false);
 }
 
-void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
+void ggml_sycl_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
+    ggml_sycl_assign_buffers_impl(tensor, true, false, true);
 }
 
-void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
+void ggml_sycl_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
+    ggml_sycl_assign_buffers_impl(tensor, false, false, false);
 }
 
-void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
+void ggml_sycl_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
+    ggml_sycl_assign_buffers_impl(tensor, false, true, false);
 }
 
-void ggml_cuda_set_main_device(const int main_device) try {
+void ggml_sycl_set_main_device(const int main_device) try {
 
     if (main_device >= g_all_sycl_device_count) {
         fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
@@ -11285,16 +11273,16 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-void ggml_cuda_set_scratch_size(const size_t scratch_size) {
+void ggml_sycl_set_scratch_size(const size_t scratch_size) {
     // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
     // it still won't always work as expected, but it's better than nothing
     if (scratch_size > g_scratch_size) {
-        ggml_cuda_free_scratch();
+        ggml_sycl_free_scratch();
     }
     g_scratch_size = std::max(g_scratch_size, scratch_size);
 }
 
-void ggml_cuda_free_scratch() try {
+void ggml_sycl_free_scratch() try {
     if (g_scratch_buffer == nullptr) {
         return;
     }
@@ -11309,10 +11297,10 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
+bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
+    if (!g_sycl_loaded) return false;
 
-    ggml_cuda_func_t func;
+    ggml_sycl_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
@@ -11332,129 +11320,129 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
 
     switch (tensor->op) {
         case GGML_OP_REPEAT:
-            func = ggml_cuda_repeat;
+            func = ggml_sycl_repeat;
             break;
         case GGML_OP_GET_ROWS:
-            func = ggml_cuda_get_rows;
+            func = ggml_sycl_get_rows;
             break;
         case GGML_OP_DUP:
-            func = ggml_cuda_dup;
+            func = ggml_sycl_dup;
             break;
         case GGML_OP_ADD:
-            func = ggml_cuda_add;
+            func = ggml_sycl_add;
             break;
         case GGML_OP_ACC:
-            func = ggml_cuda_acc;
+            func = ggml_sycl_acc;
             break;
         case GGML_OP_MUL:
-            func = ggml_cuda_mul;
+            func = ggml_sycl_mul;
             break;
         case GGML_OP_DIV:
-            func = ggml_cuda_div;
+            func = ggml_sycl_div;
             break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(tensor)) {
                 case GGML_UNARY_OP_GELU:
-                    func = ggml_cuda_gelu;
+                    func = ggml_sycl_gelu;
                     break;
                 case GGML_UNARY_OP_SILU:
-                    func = ggml_cuda_silu;
+                    func = ggml_sycl_silu;
                     break;
                 case GGML_UNARY_OP_GELU_QUICK:
-                    func = ggml_cuda_gelu_quick;
+                    func = ggml_sycl_gelu_quick;
                     break;
                 case GGML_UNARY_OP_TANH:
-                    func = ggml_cuda_tanh;
+                    func = ggml_sycl_tanh;
                     break;
                 case GGML_UNARY_OP_RELU:
-                    func = ggml_cuda_relu;
+                    func = ggml_sycl_relu;
                     break;
                 default:
                     return false;
             }
             break;
         case GGML_OP_NORM:
-            func = ggml_cuda_norm;
+            func = ggml_sycl_norm;
             break;
         case GGML_OP_GROUP_NORM:
-            func = ggml_cuda_group_norm;
+            func = ggml_sycl_group_norm;
             break;
         case GGML_OP_CONCAT:
-            func = ggml_cuda_concat;
+            func = ggml_sycl_concat;
             break;
         case GGML_OP_UPSCALE:
-            func = ggml_cuda_upscale;
+            func = ggml_sycl_upscale;
             break;
         case GGML_OP_PAD:
-            func = ggml_cuda_pad;
+            func = ggml_sycl_pad;
             break;
         case GGML_OP_LEAKY_RELU:
-            func = ggml_cuda_leaky_relu;
+            func = ggml_sycl_leaky_relu;
             break;
         case GGML_OP_RMS_NORM:
-            func = ggml_cuda_rms_norm;
+            func = ggml_sycl_rms_norm;
             break;
         case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
+            if (!any_on_device && !ggml_sycl_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
                 return false;
             }
-            func = ggml_cuda_mul_mat;
+            func = ggml_sycl_mul_mat;
             break;
         case GGML_OP_MUL_MAT_ID:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
+            if (!any_on_device && !ggml_sycl_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
                 return false;
             }
-            func = ggml_cuda_mul_mat_id;
+            func = ggml_sycl_mul_mat_id;
             break;
         case GGML_OP_SCALE:
-            func = ggml_cuda_scale;
+            func = ggml_sycl_scale;
             break;
         case GGML_OP_SQR:
-            func = ggml_cuda_sqr;
+            func = ggml_sycl_sqr;
             break;
         case GGML_OP_CLAMP:
-            func = ggml_cuda_clamp;
+            func = ggml_sycl_clamp;
             break;
         case GGML_OP_CPY:
-            func = ggml_cuda_cpy;
+            func = ggml_sycl_cpy;
             break;
         case GGML_OP_CONT:
-            func = ggml_cuda_dup;
+            func = ggml_sycl_dup;
             break;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
         case GGML_OP_PERMUTE:
         case GGML_OP_TRANSPOSE:
-            func = ggml_cuda_nop;
+            func = ggml_sycl_nop;
             break;
         case GGML_OP_DIAG_MASK_INF:
-            func = ggml_cuda_diag_mask_inf;
+            func = ggml_sycl_diag_mask_inf;
             break;
         case GGML_OP_SOFT_MAX:
-            func = ggml_cuda_soft_max;
+            func = ggml_sycl_soft_max;
             break;
         case GGML_OP_ROPE:
-            func = ggml_cuda_rope;
+            func = ggml_sycl_rope;
             break;
         case GGML_OP_ALIBI:
-            func = ggml_cuda_alibi;
+            func = ggml_sycl_alibi;
             break;
         case GGML_OP_IM2COL:
-            func = ggml_cuda_im2col;
+            func = ggml_sycl_im2col;
             break;
         case GGML_OP_SUM_ROWS:
-            func = ggml_cuda_sum_rows;
+            func = ggml_sycl_sum_rows;
             break;
         case GGML_OP_ARGSORT:
-            func = ggml_cuda_argsort;
+            func = ggml_sycl_argsort;
             break;
         default:
             return false;
     }
 
     if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
-        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
+        ggml_sycl_set_peer_access(tensor->src[1]->ne[1]);
     }
 
     if (params->ith != 0) {
@@ -11467,7 +11455,7 @@ bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_
     return true;
 }
 
-int ggml_cuda_get_device_count() try {
+int ggml_sycl_get_device_count() try {
     int device_count;
     if (CHECK_TRY_ERROR(device_count =
                              dpct::dev_mgr::instance().device_count()) != 0) {
@@ -11481,7 +11469,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-void ggml_cuda_get_device_description(int device, char *description,
+void ggml_sycl_get_device_description(int device, char *description,
                                       size_t description_size) try {
     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@@ -11500,27 +11488,27 @@ catch (sycl::exception const &exc) {
 
 #define UNUSED GGML_UNUSED
 
-// cuda buffer
+// sycl buffer
 
-struct ggml_backend_buffer_context_cuda {
+struct ggml_backend_buffer_context_sycl {
     int device;
     void * dev_ptr = nullptr;
     ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
     size_t temp_tensor_extra_index = 0;
 
-    ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+    ggml_backend_buffer_context_sycl(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
 
-    ~ggml_backend_buffer_context_cuda() {
+    ~ggml_backend_buffer_context_sycl() {
         delete[] temp_tensor_extras;
     }
 
-    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+    ggml_tensor_extra_gpu * ggml_sycl_alloc_temp_tensor_extra() {
         if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
+            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_SYCL_MAX_NODES];
         }
 
         size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
+        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_SYCL_MAX_NODES;
         ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
         memset(extra, 0, sizeof(*extra));
 
@@ -11529,8 +11517,8 @@ struct ggml_backend_buffer_context_cuda {
 };
 
 static void
-ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
+    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
     SYCL_CHECK(
         CHECK_TRY_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
     delete ctx;
@@ -11541,14 +11529,14 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
+    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
     return ctx->dev_ptr;
 }
 
-static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer,
+static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                                  ggml_tensor *tensor) try {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
         assert(tensor->view_src->buffer->buft == buffer->buft);
@@ -11557,7 +11545,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer,
         return;
     }
 
-    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+    ggml_tensor_extra_gpu * extra = ctx->ggml_sycl_alloc_temp_tensor_extra();
 
     extra->data_device[ctx->device] = tensor->data;
 
@@ -11574,7 +11562,7 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer,
         size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
         if (padded_size > original_size && tensor->view_src == nullptr) {
-            SYCL_CHECK(CHECK_TRY_ERROR(g_cudaStreams[ctx->device][0]->memset(
+            SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[ctx->device][0]->memset(
                 (char *)tensor->data + original_size, 0,
                 padded_size - original_size)));
         }
@@ -11588,15 +11576,15 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer,
+static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                                 ggml_tensor *tensor,
                                                 const void *data, size_t offset,
                                                 size_t size) try {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
 
-    ggml_cuda_set_device(ctx->device);
+    ggml_sycl_set_device(ctx->device);
     SYCL_CHECK(
         CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
@@ -11611,15 +11599,15 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer,
+static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
                                                 const ggml_tensor *tensor,
                                                 void *data, size_t offset,
                                                 size_t size) try {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
 
-    ggml_cuda_set_device(ctx->device);
+    ggml_sycl_set_device(ctx->device);
     SYCL_CHECK(
         CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
@@ -11634,11 +11622,11 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer,
+static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
                                            uint8_t value) try {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
 
-    ggml_cuda_set_device(ctx->device);
+    ggml_sycl_set_device(ctx->device);
     SYCL_CHECK(
         CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
@@ -11652,35 +11640,35 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
-    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
+static struct ggml_backend_buffer_i sycl_backend_buffer_interface = {
+    /* .free_buffer     = */ ggml_backend_sycl_buffer_free_buffer,
+    /* .get_base        = */ ggml_backend_sycl_buffer_get_base,
+    /* .init_tensor     = */ ggml_backend_sycl_buffer_init_tensor,
+    /* .set_tensor      = */ ggml_backend_sycl_buffer_set_tensor,
+    /* .get_tensor      = */ ggml_backend_sycl_buffer_get_tensor,
     /* .cpy_tensor_from = */ NULL,
     /* .cpy_tensor_to   = */ NULL,
-    /* .clear           = */ ggml_backend_cuda_buffer_clear,
+    /* .clear           = */ ggml_backend_sycl_buffer_clear,
 };
 
-// cuda buffer type
+// sycl buffer type
 
 static ggml_backend_buffer_t
-ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
+ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
                                            size_t size) try {
     int device = (int) (intptr_t) buft->context;
 
-    ggml_cuda_set_device(device);
+    ggml_sycl_set_device(device);
 
-    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
+    size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
 
     void * dev_ptr;
     SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
                                     size, dpct::get_in_order_queue())));
 
-    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+    ggml_backend_buffer_context_sycl * ctx = new ggml_backend_buffer_context_sycl(device, dev_ptr);
 
-    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
+    return ggml_backend_buffer_init(buft, sycl_backend_buffer_interface, ctx, size);
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -11688,13 +11676,13 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     return 128;
 
     UNUSED(buft);
 }
 
-static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
     int64_t row_low = 0;
     int64_t row_high = ggml_nrows(tensor);
     int64_t nrows_split = row_high - row_low;
@@ -11714,46 +11702,46 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t
     UNUSED(buft);
 }
 
-static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cuda(backend);
+static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+    return ggml_backend_is_sycl(backend);
 
     UNUSED(buft);
 }
 
-static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
-    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
-    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
+static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
+    /* .alloc_buffer     = */ ggml_backend_sycl_buffer_type_alloc_buffer,
+    /* .get_alignment    = */ ggml_backend_sycl_buffer_type_get_alignment,
+    /* .get_alloc_size   = */ ggml_backend_sycl_buffer_type_get_alloc_size,
+    /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
     /* .is_host          = */ nullptr,
 };
 
-ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
+    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_types[GGML_SYCL_MAX_DEVICES];
 
-    static bool ggml_backend_cuda_buffer_type_initialized = false;
+    static bool ggml_backend_sycl_buffer_type_initialized = false;
 
-    if (!ggml_backend_cuda_buffer_type_initialized) {
-        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
-            ggml_backend_cuda_buffer_types[i] = {
-                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
+    if (!ggml_backend_sycl_buffer_type_initialized) {
+        for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) {
+            ggml_backend_sycl_buffer_types[i] = {
+                /* .iface    = */ ggml_backend_sycl_buffer_type_interface,
                 /* .context  = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
             };
         }
-        ggml_backend_cuda_buffer_type_initialized = true;
+        ggml_backend_sycl_buffer_type_initialized = true;
     }
 
-    return &ggml_backend_cuda_buffer_types[device];
+    return &ggml_backend_sycl_buffer_types[device];
 }
 
 // host buffer type
 
-static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_cuda_host_free(buffer->context);
+static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+    ggml_sycl_host_free(buffer->context);
 }
 
-static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr = ggml_cuda_host_malloc(size);
+static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+    void * ptr = ggml_sycl_host_malloc(size);
 
     if (ptr == nullptr) {
         // fallback to cpu buffer
@@ -11763,15 +11751,15 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm
     // FIXME: this is a hack to avoid having to implement a new buffer type
     ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
     buffer->buft = buft;
-    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
+    buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
 
     return buffer;
 }
 
-ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
+ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
+    static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
         /* .iface    = */ {
-            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
+            /* .alloc_buffer     = */ ggml_backend_sycl_host_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
             /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
             /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
@@ -11780,44 +11768,44 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
         /* .context  = */ nullptr,
     };
 
-    return &ggml_backend_cuda_buffer_type_host;
+    return &ggml_backend_sycl_buffer_type_host;
 }
 
 // backend
 
-struct ggml_backend_context_cuda {
+struct ggml_backend_context_sycl {
     int device;
 };
 
-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-    return GGML_CUDA_NAME;
+static const char * ggml_backend_sycl_name(ggml_backend_t backend) {
+    return GGML_SYCL_NAME;
 
     UNUSED(backend);
 }
 
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+static void ggml_backend_sycl_free(ggml_backend_t backend) {
+    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
 
-    delete cuda_ctx;
+    delete sycl_ctx;
     delete backend;
 }
 
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
+    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
 
-    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
+    return ggml_backend_sycl_buffer_type(sycl_ctx->device);
 }
 
-static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
+static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
                                                ggml_tensor *tensor,
                                                const void *data, size_t offset,
                                                size_t size) try {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
 
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    SYCL_CHECK(CHECK_TRY_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+    SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
         (char *)tensor->data + offset, data, size)));
 }
 catch (sycl::exception const &exc) {
@@ -11826,16 +11814,16 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
+static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend,
                                                const ggml_tensor *tensor,
                                                void *data, size_t offset,
                                                size_t size) try {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
 
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
+    GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type");
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    SYCL_CHECK(CHECK_TRY_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
+    SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy(
         data, (const char *)tensor->data + offset, size)));
 }
 catch (sycl::exception const &exc) {
@@ -11844,10 +11832,10 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try {
+    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
 
-    SYCL_CHECK(CHECK_TRY_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
+    SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait()));
 
     UNUSED(backend);
 }
@@ -11857,7 +11845,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_backend_graph_plan_t ggml_backend_sycl_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
     GGML_ASSERT(!"not implemented");
 
     return nullptr;
@@ -11866,24 +11854,24 @@ static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backen
     UNUSED(cgraph);
 }
 
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+static void ggml_backend_sycl_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
     GGML_ASSERT(!"not implemented");
 
     UNUSED(backend);
     UNUSED(plan);
 }
 
-static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+static void ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
 
-    ggml_cuda_set_main_device(cuda_ctx->device);
+    ggml_sycl_set_main_device(sycl_ctx->device);
 
     ggml_compute_params params = {};
     params.type = GGML_TASK_COMPUTE;
@@ -11895,18 +11883,18 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
             continue;
 
         assert(node->backend == GGML_BACKEND_GPU);
-        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+        assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
         assert(node->extra != nullptr);
 
         for (int j = 0; j < GGML_MAX_SRC; j++) {
             if (node->src[j] != nullptr) {
                 assert(node->src[j]->backend == GGML_BACKEND_GPU);
-                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
+                assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device));
                 assert(node->src[j]->extra != nullptr);
             }
         }
 
-        bool ok = ggml_cuda_compute_forward(&params, node);
+        bool ok = ggml_sycl_compute_forward(&params, node);
         if (!ok) {
             fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
         }
@@ -11914,9 +11902,9 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
 
 #if 0
         if (node->type == GGML_TYPE_F32) {
-            cudaDeviceSynchronize();
+            syclDeviceSynchronize();
             std::vector<float> tmp(ggml_nelements(node), 0.0f);
-            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+            syclMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), syclMemcpyDeviceToHost);
             printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
                 ggml_type_name(node->src[0]->type),
                 node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
@@ -11939,7 +11927,7 @@ static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph
     UNUSED(backend);
 }
 
-static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
+static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
     switch (op->op) {
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
@@ -12046,65 +12034,65 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten
     UNUSED(backend);
 }
 
-static ggml_backend_i cuda_backend_i = {
-    /* .get_name                = */ ggml_backend_cuda_name,
-    /* .free                    = */ ggml_backend_cuda_free,
-    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
-    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
-    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
+static ggml_backend_i sycl_backend_i = {
+    /* .get_name                = */ ggml_backend_sycl_name,
+    /* .free                    = */ ggml_backend_sycl_free,
+    /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
+    /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
+    /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
     /* .cpy_tensor_from_async   = */ NULL,
     /* .cpy_tensor_to_async     = */ NULL,
-    /* .synchronize             = */ ggml_backend_cuda_synchronize,
-    /* .graph_plan_create       = */ ggml_backend_cuda_graph_plan_create,
-    /* .graph_plan_free         = */ ggml_backend_cuda_graph_plan_free,
-    /* .graph_plan_compute      = */ ggml_backend_cuda_graph_plan_compute,
-    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
-    /* .supports_op             = */ ggml_backend_cuda_supports_op,
+    /* .synchronize             = */ ggml_backend_sycl_synchronize,
+    /* .graph_plan_create       = */ ggml_backend_sycl_graph_plan_create,
+    /* .graph_plan_free         = */ ggml_backend_sycl_graph_plan_free,
+    /* .graph_plan_compute      = */ ggml_backend_sycl_graph_plan_compute,
+    /* .graph_compute           = */ ggml_backend_sycl_graph_compute,
+    /* .supports_op             = */ ggml_backend_sycl_supports_op,
 };
 
-ggml_backend_t ggml_backend_cuda_init(int device) {
-    ggml_init_cublas(); // TODO: remove from ggml.c
+ggml_backend_t ggml_backend_sycl_init(int device) {
+    ggml_init_sycl(); // TODO: remove from ggml.c
 
-    if (device < 0 || device >= ggml_cuda_get_device_count()) {
+    if (device < 0 || device >= ggml_sycl_get_device_count()) {
         fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
         return nullptr;
     }
 
     // not strictly necessary, but it may reduce the overhead of the first graph_compute
-    ggml_cuda_set_main_device(device);
+    ggml_sycl_set_main_device(device);
 
-    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
+    ggml_backend_context_sycl * ctx = new ggml_backend_context_sycl {
         /* .device = */ device
     };
 
-    ggml_backend_t cuda_backend = new ggml_backend {
-        /* .interface = */ cuda_backend_i,
+    ggml_backend_t sycl_backend = new ggml_backend {
+        /* .interface = */ sycl_backend_i,
         /* .context   = */ ctx
     };
 
-    return cuda_backend;
+    return sycl_backend;
 }
 
-bool ggml_backend_is_cuda(ggml_backend_t backend) {
-    return backend->iface.get_name == ggml_backend_cuda_name;
+bool ggml_backend_is_sycl(ggml_backend_t backend) {
+    return backend->iface.get_name == ggml_backend_sycl_name;
 }
 
-static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
-    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
-    return cuda_backend;
+static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
+    ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
+    return sycl_backend;
 
     UNUSED(params);
 }
 
-extern "C" int ggml_backend_cuda_reg_devices();
+extern "C" int ggml_backend_sycl_reg_devices();
+
+int ggml_backend_sycl_reg_devices() {
+    int device_count = ggml_sycl_get_device_count();
 
-int ggml_backend_cuda_reg_devices() {
-    int device_count = ggml_cuda_get_device_count();
-    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
     for (int i = 0; i < device_count; i++) {
         char name[128];
-        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
-        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
+        snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i);
+        ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i);
     }
     return device_count;
 }
diff --git a/ggml-sycl.hpp b/ggml-sycl.hpp
index 235bbbd8a2a72..6fc54bc8adb52 100644
--- a/ggml-sycl.hpp
+++ b/ggml-sycl.hpp
@@ -1,19 +1,59 @@
-#include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
-// typedef sycl::half ggml_fp16_t;
-
-#define CHECK_TRY_ERROR(expr)                                                  \
-  [&]() {                                                                      \
-    try {                                                                      \
-      expr;                                                                    \
-      return dpct::success;                                                    \
-    } catch (std::exception const &e) {                                        \
-      std::cerr << e.what()<< "\nException caught at file:" << __FILE__        \
-        << ", line:" << __LINE__ <<", func:"<<__func__<< std::endl;            \
-      return dpct::default_error;                                              \
-    }                                                                          \
-  }()
-
-// #define DEBUG_CUDA_MALLOC
-
-int get_main_device();
\ No newline at end of file
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+#define GGML_SYCL_MAX_DEVICES       16
+#define GGML_SYCL_NAME "SYCL"
+
+// Always success. To check if SYCL is actually loaded, use `ggml_sycl_loaded`.
+GGML_API void   ggml_init_sycl(void);
+
+// Returns `true` if there are available SYCL devices and cublas loads successfully; otherwise, it returns `false`.
+GGML_API bool   ggml_sycl_loaded(void);
+
+GGML_API void * ggml_sycl_host_malloc(size_t size);
+GGML_API void   ggml_sycl_host_free(void * ptr);
+
+GGML_API bool   ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void   ggml_sycl_set_tensor_split(const float * tensor_split);
+GGML_API void   ggml_sycl_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void   ggml_sycl_free_data(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_sycl_assign_buffers(struct ggml_tensor * tensor);
+GGML_API void   ggml_sycl_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+GGML_API void   ggml_sycl_assign_buffers_force_inplace(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_sycl_assign_buffers_no_alloc(struct ggml_tensor * tensor);
+GGML_API void   ggml_sycl_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
+GGML_API void   ggml_sycl_copy_to_device(struct ggml_tensor * tensor);
+
+GGML_API void   ggml_sycl_set_main_device(int main_device);
+GGML_API void   ggml_sycl_set_mul_mat_q(bool mul_mat_q);
+GGML_API void   ggml_sycl_set_scratch_size(size_t scratch_size);
+GGML_API void   ggml_sycl_free_scratch(void);
+GGML_API bool   ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+
+GGML_API int    ggml_sycl_get_device_count(void);
+GGML_API void   ggml_sycl_get_device_description(int device, char * description, size_t description_size);
+
+// backend API
+GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
+
+GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
+GGML_API int  ggml_backend_sycl_get_device(ggml_backend_t backend);
+
+GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
+
+// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
+GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+
+int get_main_device();
+
+#ifdef  __cplusplus
+}
+#endif
diff --git a/ggml.c b/ggml.c
index f85045c9c40ee..c4bec02871b74 100644
--- a/ggml.c
+++ b/ggml.c
@@ -248,6 +248,8 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
+#elif defined(GGML_USE_SYCL)
+#include "ggml-sycl.hpp"
 #endif
 
 // floating point type used to accumulate sums
@@ -2293,6 +2295,8 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         ggml_init_cublas();
 #elif defined(GGML_USE_CLBLAST)
         ggml_cl_init();
+#elif defined(GGML_USE_SYCL)
+        ggml_init_sycl();
 #endif
 
         ggml_setup_op_has_task_pass();
@@ -14687,6 +14691,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_CUBLAS
 
+#ifdef GGML_USE_SYCL
+    bool skip_cpu = ggml_sycl_compute_forward(params, tensor);
+    if (skip_cpu) {
+        return;
+    }
+    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
+    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
+#endif // GGML_USE_SYCL
     switch (tensor->op) {
         case GGML_OP_DUP:
             {
@@ -20263,7 +20275,7 @@ int ggml_cpu_has_wasm_simd(void) {
 }
 
 int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
     return 1;
 #else
     return 0;
@@ -20286,8 +20298,16 @@ int ggml_cpu_has_clblast(void) {
 #endif
 }
 
+int ggml_cpu_has_sycl(void) {
+#if defined(GGML_USE_SYCL)
+    return 1;
+#else
+    return 0;
+#endif
+}
+
 int ggml_cpu_has_gpublas(void) {
-    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast();
+    return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_sycl();
 }
 
 int ggml_cpu_has_sse3(void) {
diff --git a/llama.cpp b/llama.cpp
index 276647be330c9..e1e7a56a2658b 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -11,9 +11,7 @@
 #  include "ggml-cuda.h"
 #elif defined(GGML_USE_CLBLAST)
 #  include "ggml-opencl.h"
-#endif
-
-#ifdef GGML_USE_SYCL
+#elif defined(GGML_USE_SYCL)
 #  include "ggml-sycl.hpp"
 #endif
 
@@ -1260,6 +1258,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
     if (host_buffer) {
         buft = ggml_backend_cuda_host_buffer_type();
     }
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
 #endif
@@ -1279,6 +1279,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
     buft = ggml_backend_metal_buffer_type();
 #elif defined(GGML_USE_CUBLAS)
     buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_SYCL)
+    buft = ggml_backend_sycl_buffer_type(gpu);
 #elif defined(GGML_USE_CLBLAST)
     buft = ggml_backend_opencl_buffer_type();
 #endif
@@ -9935,6 +9937,15 @@ struct llama_context * llama_new_context_with_model(
                 }
             }
         }
+#elif defined(GGML_USE_SYCL)
+        if (model->n_gpu_layers > 0) {
+            ctx->backend = ggml_backend_sycl_init(0);
+            if (ctx->backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL backend\n", __func__);
+            }
+        }
+
+
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {
diff --git a/llama.h b/llama.h
index bb60545575932..b083111fa261c 100644
--- a/llama.h
+++ b/llama.h
@@ -6,6 +6,9 @@
 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
+#elif defined(GGML_USE_SYCL)
+#include "ggml-sycl.hpp"
+#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
 #else
 #define LLAMA_MAX_DEVICES 1
 #endif // GGML_USE_CUBLAS
@@ -46,7 +49,7 @@
 #define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 4
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_SYCL)
 // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
 #define LLAMA_SUPPORTS_GPU_OFFLOAD
 #endif

From 69d76c8b58b98f9dac5c52af129367666be05d9d Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Mon, 8 Jan 2024 14:23:55 +0800
Subject: [PATCH 13/90] fix error of select non-zero device, format device list

---
 ggml-sycl.cpp | 60 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 37 insertions(+), 23 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 870676c9631d9..bf36355d4078f 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -63,6 +63,10 @@ static int g_ggml_sycl_debug=0;
 
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
+static void crash(){
+    int *ptr = NULL;
+    *ptr = 0;
+}
 
 static void ggml_sycl_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
     fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
@@ -358,22 +362,21 @@ struct ggml_tensor_extra_gpu {
 
 inline dpct::err0 ggml_sycl_set_device(const int device) try {
     int current_device;
+
     SYCL_CHECK(CHECK_TRY_ERROR(
         current_device = dpct::dev_mgr::instance().current_device_id()));
 
+    // GGML_SYCL_DEBUG("ggml_sycl_set_device device=%d, current_device=%d\n", device, current_device);
     if (device == current_device) {
         return 0;
     }
 
-    /*
-    DPCT1093:53: The "device" device may be not the one intended for use. Adjust
-    the selected device if needed.
-    */
     return CHECK_TRY_ERROR(dpct::select_device(device));
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
             << ", line:" << __LINE__ << std::endl;
+  crash();
   std::exit(1);
 }
 
@@ -6391,6 +6394,13 @@ int get_device_index_by_id(int id){
     return res;
 }
 
+int get_device_id_by_index(int index){
+    int res = g_device_caps[index].device_id;
+    GGML_ASSERT(res>=0);
+    return res;
+}
+
+
 int get_current_device_index(){
     return get_device_index_by_id(dpct::dev_mgr::instance().current_device_id());
 }
@@ -8258,12 +8268,13 @@ bool ggml_sycl_loaded(void) {
 }
 void print_devices(){
     int device_count = dpct::dev_mgr::instance().device_count();
-    fprintf(stderr, "%s: found %d SYCL devices:\n", __func__, device_count);
+    fprintf(stderr, "\n%s: found %d SYCL devices:\n", __func__, device_count);
     for (int id = 0; id < device_count; ++id) {
         dpct::device_info prop;
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
             prop, dpct::dev_mgr::instance().get_device(id))));
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d, max compute_units %d, max work group size %d, max sub group size %d, global mem size %lu\n", id,
+        sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
+        fprintf(stderr, "  Device %d: %s,\tcompute capability %d.%d,\n\tmax compute_units %d,\tmax work group size %d,\tmax sub group size %d,\tglobal mem size %lu\n", id,
                 prop.get_name(), prop.get_major_version(),
                 prop.get_minor_version(),
                 prop.get_max_compute_units(),
@@ -8272,6 +8283,7 @@ void print_devices(){
                 prop.get_global_mem_size()
                 );
     }
+    fprintf(stderr, "\n");
 }
 
 int get_sycl_env(const char* env_name, int default_val){
@@ -8302,7 +8314,7 @@ void ggml_init_sycl() try {
 
         printf("GGML_SYCL_DEBUG=%d\n", g_ggml_sycl_debug);
 
-        int user_device_number = get_sycl_env("GGML_SYCL_DEVICE", 0);
+        int user_device_id = get_sycl_env("GGML_SYCL_DEVICE", 0);
 
         print_devices();
 
@@ -8341,7 +8353,7 @@ void ggml_init_sycl() try {
 
         int device_inx = -1;
         for (int id = 0; id < g_all_sycl_device_count; ++id) {
-            if(id!=user_device_number) continue;
+            if(id!=user_device_id) continue;
 
             device_inx++;
             int device_vmm = 0;
@@ -8369,14 +8381,14 @@ void ggml_init_sycl() try {
         }
         device_inx = -1;
         for (int id = 0; id < g_all_sycl_device_count; ++id) {
-            if(id!=user_device_number) continue;
+            if(id!=user_device_id) continue;
             device_inx++;
             g_tensor_split[device_inx] /= total_vram;
         }
 
         device_inx = -1;
         for (int id = 0; id < g_all_sycl_device_count; ++id) {
-            if(id!=user_device_number) continue;
+            if(id!=user_device_id) continue;
             device_inx++;
             SYCL_CHECK(ggml_sycl_set_device(id));
 
@@ -8406,9 +8418,9 @@ void ggml_init_sycl() try {
 
         //hardcode, force set to 1 device
         g_device_count = 1;
-        ggml_sycl_set_main_device(user_device_number);
-        ggml_sycl_set_device(user_device_number);
-        fprintf(stderr, "Using Device %d\n", user_device_number);
+        ggml_sycl_set_main_device(user_device_id);
+        ggml_sycl_set_device(user_device_id);
+        fprintf(stderr, "Using Device %d\n", user_device_id);
 
         // for (int id = 0; id < g_all_sycl_device_count; ++id) {
         //     GGML_SYCL_DEBUG("id=%d  g_device_caps[%d].device_id=%d g_sycl_device_id2index[%d].index=%d ", id, id,
@@ -9583,7 +9595,8 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
 
     ggml_sycl_set_device(g_main_device);
     dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
-    GGML_SYCL_DEBUG("g_main_device_index=%d, src0=%p\n", g_main_device_index, src0);
+    GGML_SYCL_DEBUG("g_main_device_index=%d, src0=%p main_stream=%p src0_on_device=%d\n",
+        g_main_device_index, src0, main_stream, src0_on_device);
 
     if (src0_on_device) {
         src0_ddf = (float *) src0_extra->data_device[g_main_device_index];
@@ -9609,6 +9622,8 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
         dst_ddf = dst_f.alloc(ggml_nelements(dst));
     }
 
+    GGML_SYCL_DEBUG("op src0=%p, src1=%p, dst=%p, src0_ddf=%p, src1_ddf=%p, dst_ddf=%p, main_stream=%p\n",
+        src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
     // do the computation
     op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
     /*
@@ -9648,12 +9663,12 @@ static void ggml_sycl_set_peer_access(const int n_tokens) {
 
 #ifdef NDEBUG
     for (int id = 0; id < g_device_count; ++id) {
-        SYCL_CHECK(ggml_sycl_set_device(g_device_caps[id].device_id));
+        SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id)));
         // SYCL_CHECK(syclDeviceSynchronize());
     }
 
     for (int id = 0; id < g_device_count; ++id) {
-        SYCL_CHECK(ggml_sycl_set_device(g_device_caps[id].device_id));
+        SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id)));
         int device_id = g_device_caps[id].device_id;
 
         for (int id_other = 0; id_other < g_device_count; ++id_other) {
@@ -9786,7 +9801,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
         const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device_index;
         const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
 
-        ggml_sycl_set_device(id);
+        ggml_sycl_set_device(get_device_id_by_index(id));
         const dpct::queue_ptr stream = g_syclStreams[id][0];
 
         if (src0_on_device && src0_is_contiguous) {
@@ -9852,7 +9867,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
             const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device_index;
             const int64_t row_diff = row_high[id] - row_low[id];
 
-            ggml_sycl_set_device(id);
+            ggml_sycl_set_device(get_device_id_by_index(id));
             const dpct::queue_ptr stream = g_syclStreams[id][is];
 
             // wait for main GPU data if necessary
@@ -9983,7 +9998,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
         if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) {
             continue;
         }
-        SYCL_CHECK(ggml_sycl_set_device(id));
+        SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id)));
 
         // free buffers again when done
         if (dst_as[id] > 0) {
@@ -10978,8 +10993,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
         if (backend == GGML_BACKEND_GPU && id != g_main_device_index) {
             continue;
         }
-
-        ggml_sycl_set_device(id);
+        ggml_sycl_set_device(get_device_id_by_index(id));
 
         int64_t row_low, row_high;
         if (backend == GGML_BACKEND_GPU) {
@@ -11059,14 +11073,14 @@ void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
 
     for (int64_t id = 0; id < g_device_count; ++id) {
         if (extra->data_device[id] != nullptr) {
-            SYCL_CHECK(ggml_sycl_set_device(id));
+            SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id)));
             SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
                 extra->data_device[id], dpct::get_in_order_queue())));
         }
 
         for (int64_t is = 0; is < MAX_STREAMS; ++is) {
             if (extra->events[id][is] != nullptr) {
-                SYCL_CHECK(ggml_sycl_set_device(id));
+                SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id)));
                 SYCL_CHECK(CHECK_TRY_ERROR(
                     dpct::destroy_event(extra->events[id][is])));
             }

From c709c3cb3706141ab1f02b22115369127ff316a9 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Tue, 9 Jan 2024 08:48:18 +0800
Subject: [PATCH 14/90] ren ggml-sycl.hpp -> ggml-sycl.h

---
 CMakeLists.txt               | 2 +-
 ggml-sycl.cpp                | 2 +-
 ggml-sycl.hpp => ggml-sycl.h | 4 ----
 ggml.c                       | 2 +-
 llama.cpp                    | 2 +-
 llama.h                      | 2 +-
 6 files changed, 5 insertions(+), 9 deletions(-)
 rename ggml-sycl.hpp => ggml-sycl.h (87%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ce3a75d09b87a..af2dc8771be91 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -486,7 +486,7 @@ if (LLAMA_SYCL)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
 
-        set(GGML_HEADERS_SYCL ggml-cuda.h ggml.h ggml-sycl.hpp)
+        set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
         set(GGML_SOURCES_SYCL ggml-sycl.cpp)
 
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl  mkl_sycl_blas mkl_sycl_lapack mkl_sycl_dft mkl_sycl_sparse mkl_sycl_vm mkl_sycl_rng mkl_sycl_stats mkl_sycl_data_fitting mkl_intel_ilp64 mkl_tbb_thread)
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index bf36355d4078f..890620c923b3f 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -16,7 +16,7 @@
 #include <dpct/dpct.hpp>
 #include <dpct/blas_utils.hpp>
 #include <dpct/lib_common_utils.hpp>
-#include "ggml-sycl.hpp"
+#include "ggml-sycl.h"
 #include "ggml.h"
 #include "ggml-backend-impl.h"
 
diff --git a/ggml-sycl.hpp b/ggml-sycl.h
similarity index 87%
rename from ggml-sycl.hpp
rename to ggml-sycl.h
index 6fc54bc8adb52..9530c54c2e851 100644
--- a/ggml-sycl.hpp
+++ b/ggml-sycl.h
@@ -10,10 +10,8 @@ extern "C" {
 #define GGML_SYCL_MAX_DEVICES       16
 #define GGML_SYCL_NAME "SYCL"
 
-// Always success. To check if SYCL is actually loaded, use `ggml_sycl_loaded`.
 GGML_API void   ggml_init_sycl(void);
 
-// Returns `true` if there are available SYCL devices and cublas loads successfully; otherwise, it returns `false`.
 GGML_API bool   ggml_sycl_loaded(void);
 
 GGML_API void * ggml_sycl_host_malloc(size_t size);
@@ -41,7 +39,6 @@ GGML_API bool   ggml_sycl_compute_forward(struct ggml_compute_params * params, s
 GGML_API int    ggml_sycl_get_device_count(void);
 GGML_API void   ggml_sycl_get_device_description(int device, char * description, size_t description_size);
 
-// backend API
 GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 
 GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
@@ -49,7 +46,6 @@ GGML_API int  ggml_backend_sycl_get_device(ggml_backend_t backend);
 
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 
-// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 
 int get_main_device();
diff --git a/ggml.c b/ggml.c
index c4bec02871b74..d59af30b30691 100644
--- a/ggml.c
+++ b/ggml.c
@@ -249,7 +249,7 @@ inline static void * ggml_aligned_malloc(size_t size) {
 #elif defined(GGML_USE_CLBLAST)
 #include "ggml-opencl.h"
 #elif defined(GGML_USE_SYCL)
-#include "ggml-sycl.hpp"
+#include "ggml-sycl.h"
 #endif
 
 // floating point type used to accumulate sums
diff --git a/llama.cpp b/llama.cpp
index e1e7a56a2658b..61bafc9d208d6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -12,7 +12,7 @@
 #elif defined(GGML_USE_CLBLAST)
 #  include "ggml-opencl.h"
 #elif defined(GGML_USE_SYCL)
-#  include "ggml-sycl.hpp"
+#  include "ggml-sycl.h"
 #endif
 
 #ifdef GGML_USE_METAL
diff --git a/llama.h b/llama.h
index b083111fa261c..07253e1157fab 100644
--- a/llama.h
+++ b/llama.h
@@ -7,7 +7,7 @@
 #include "ggml-cuda.h"
 #define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
 #elif defined(GGML_USE_SYCL)
-#include "ggml-sycl.hpp"
+#include "ggml-sycl.h"
 #define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
 #else
 #define LLAMA_MAX_DEVICES 1

From fa3a58605b6ece12c009a9ff0d48f33a27f00e42 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Tue, 9 Jan 2024 09:37:54 +0800
Subject: [PATCH 15/90] clear CMAKE to rm unused lib and options

---
 CMakeLists.txt | 58 ++++++++++++--------------------------------------
 migrate.sh     | 18 ----------------
 2 files changed, 14 insertions(+), 62 deletions(-)
 delete mode 100755 migrate.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index af2dc8771be91..d355687702c05 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -448,59 +448,29 @@ endif()
 
 
 if (LLAMA_SYCL)
-    set(ENABLE_AOT ats)
-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "IntelLLVM")
-	    message(WARNING "${CMAKE_C_COMPILER_ID} Need IntelLLVM for SYCL")
-    endif()
-    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "IntelLLVM")
-        message(WARNING "${CMAKE_CXX_COMPILER_ID} Need IntelLLVM for SYCL")
+    if ( NOT DEFINED ENV{ONEAPI_ROOT})
+	message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
     endif()
+    
+    #todo: AOT
 
     find_package(IntelSYCL REQUIRED)
 
-    # Check SYCL support by the compiler
-    check_cxx_compiler_flag("-fsycl" _fsycl_option)
-    if (_fsycl_option)
-	    #set (CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES} "/opt/intel/oneapi/compiler/2024.0/include")
-        CHECK_INCLUDE_FILE_CXX("sycl/sycl.hpp" _sycl_header "-fsycl")
-	    set (_sycl_header "/opt/intel/oneapi/compiler/2024.0/include/sycl/sycl.hpp")
-        if (NOT _sycl_header)
-            CHECK_INCLUDE_FILE_CXX("CL/sycl.hpp" _sycl_header_old "-fsycl")
-        endif()
-        if (_sycl_header OR _sycl_header_old)
-            set(_sycl_support TRUE)
-        endif()
-    endif()
+    #add_compile_definitions(GGML_SYCL_F16)
+    add_compile_definitions(GGML_USE_SYCL)
 
-    if (_sycl_support)
-	    #add_compile_definitions(GGML_USE_CUBLAS)
-	add_compile_definitions(GGML_USE_SYCL)
-	    #add_compile_definitions(GGML_SYCL_F16)
-	add_compile_options(-I./)
-        add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include)
-        add_compile_options(-I/opt/intel/oneapi/compiler/2024.0/include/sycl)
-        add_compile_options(-I/opt/intel/oneapi/dpcpp-ct/2024.0/include)
-        add_compile_options(-I/opt/intel/oneapi/2024.0/include)
+    add_compile_options(-I./) #include DPCT 
+    add_compile_options(-I/${ONEAPI_ROOT}/2024.0/include)
 
-	    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -Wno-narrowing")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -Wno-narrowing")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
 
-        set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
-        set(GGML_SOURCES_SYCL ggml-sycl.cpp)
+    set(GGML_HEADERS_SYCL ggml.h ggml-sycl.h)
+    set(GGML_SOURCES_SYCL ggml-sycl.cpp)
 
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl  mkl_sycl_blas mkl_sycl_lapack mkl_sycl_dft mkl_sycl_sparse mkl_sycl_vm mkl_sycl_rng mkl_sycl_stats mkl_sycl_data_fitting mkl_intel_ilp64 mkl_tbb_thread)
+    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
 
-        #add_library(ggml-sycl OBJECT ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL})
-        #add_executable(${PROJECT_NAME} ${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL})
-        #target_link_libraries(ggml-sycl PRIVATE sycl)
-        #target_compile_options(${PROJECT_NAME} PRIVATE ${CMAKE_CXX_FLAGS})
-        #set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl)
-	#add_sycl_to_target({})
-
-    else()
-        message(FATAL_ERROR "SYCL Support is not present")
-    endif()
 endif()
 
 
diff --git a/migrate.sh b/migrate.sh
deleted file mode 100755
index 93697c6b01579..0000000000000
--- a/migrate.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-echo "modify the ggml-sycl.cpp to fix dpct result error"
-TARGET_FILE=ggml-sycl.cpp
-sed -i "s/CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());/id = dpct::dev_mgr::instance().current_device_id();/g" ${TARGET_FILE}
-sed -i "s/CUDA_CHECK(current_device = dpct::dev_mgr::instance().current_device_id());/current_device = dpct::dev_mgr::instance().current_device_id();/g" ${TARGET_FILE}
-sed -i "s/g_cublas_handles, oneapi::mkl::transpose::trans,/*g_cublas_handles[id], oneapi::mkl::transpose::trans,/g" ${TARGET_FILE}
-sed -i "s/cu_compute_type = CUBLAS_COMPUTE_16F;/cu_compute_type = dpct::library_data_t::real_half;/g" ${TARGET_FILE}
-sed -i "s/cu_compute_type = CUBLAS_COMPUTE_32F;/cu_compute_type = dpct::library_data_t::real_float;/g" ${TARGET_FILE}
-sed -i "s/tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs, stream_ct1);/tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);/g" ${TARGET_FILE}
-sed -i "s/cuGetErrorString(err, &err_str);/\/\/cuGetErrorString(err, &err_str);/g" ${TARGET_FILE}
-
-#set empty function
-#ggml_cuda_set_peer_access
-#ggml_cuda_pool_malloc_vmm
-#ggml_cuda_pool_free_vmm
-
-#replace tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs, stream_ct1);/tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
-
-echo "done"
\ No newline at end of file

From 3645f25d7459e393b1cc2a7e69a576c14c7dd337 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 10 Jan 2024 22:27:26 +0800
Subject: [PATCH 16/90] correct queue: rm dtct:get_queue

---
 ggml-sycl.cpp | 209 ++++++++++++++++++++++++++++++++++----------------
 ggml-sycl.h   |   1 +
 2 files changed, 146 insertions(+), 64 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 890620c923b3f..5b34289139f3e 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -10,6 +10,12 @@
 #include <stdio.h>
 #include <vector>
 #include <cmath>
+#include <iostream>
+#include <fstream>
+
+#include <stdio.h>
+#include <stdlib.h>
+
 
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
@@ -21,7 +27,6 @@
 #include "ggml-backend-impl.h"
 
 static int g_ggml_sycl_debug=0;
-
 #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
 
 #define CHECK_TRY_ERROR(expr)                                                  \
@@ -38,6 +43,7 @@ static int g_ggml_sycl_debug=0;
 
 // #define DEBUG_SYCL_MALLOC
 
+static int g_work_group_size = 0;
 // typedef sycl::half ggml_fp16_t;
 
 #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
@@ -351,7 +357,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA
 
 #define MAX_STREAMS 8
 static dpct::queue_ptr g_syclStreams[GGML_SYCL_MAX_DEVICES][MAX_STREAMS] = {
-    {&dpct::get_in_order_queue()}};
+    {0}};
 
 struct ggml_tensor_extra_gpu {
     void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split tensors
@@ -422,6 +428,49 @@ static void bad_arch(const sycl::stream &stream_ct1) {
     (void) bad_arch; // suppress unused function warning
 }
 
+//todo: debug for crash in some case
+void print_ggml_tensor(const char*name, struct ggml_tensor *src){
+    if(!g_ggml_sycl_debug) return;
+    char filename[1024];
+    sprintf(filename, "%s.txt", name);
+    printf("GGML Tensor:%s save to %s:\n", name, filename);
+
+    size_t total_size = ggml_nbytes(src);
+    const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT;
+    float *local_buf = NULL;
+    // printf("total_size %d2, src_on_device %d\n", total_size, src_on_device);
+    if(src_on_device) {
+        local_buf = (float *) ggml_sycl_host_malloc(total_size);
+        // printf("local buf %p size %d bytes\n", local_buf, total_size);
+        ggml_sycl_set_device(g_main_device);
+        dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
+
+        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *)  src->extra;
+
+        // printf("zjy before memcpy local_buf=%p, src->data=%p\n", local_buf, src->data);
+        main_stream->memcpy(local_buf, src_extra->data_device[g_main_device_index], total_size);
+    }
+    else {
+        local_buf = (float *)src->data;
+        // printf("local buf from src-> data %p\n", local_buf);
+    }
+
+    size_t total_elements = ggml_nelements(src);
+    std::ofstream logfile;
+    logfile.open(filename);
+    // printf("local buf element %d\n", total_elements);
+    for(int i=0; i<total_elements; i++){
+        if((i+1)%20 ==0) logfile <<std::endl;
+        else logfile << local_buf[i] <<" ";
+    }
+    logfile <<std::endl;
+    logfile.close();
+
+    // printf("before free %p\n", local_buf);
+    if(src_on_device) ggml_sycl_host_free(local_buf);
+    // printf("free done\n");
+}
+
 static __dpct_inline__ float warp_reduce_sum(float x,
                                              const sycl::nd_item<3> &item_ct1) {
 #pragma unroll
@@ -661,9 +710,8 @@ static void sqr_f32(const float * x, float * dst, const int k,
     dst[i] = x[i] * x[i];
 }
 
-template <int block_size>
 static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum) {
+                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum, int block_size) {
     const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                     item_ct1.get_local_id(1);
     const int tid = item_ct1.get_local_id(2);
@@ -764,9 +812,8 @@ static void pad_f32(const float  *x, float *dst, const int ne0, const int ne00,
     }
 }
 
-template <int block_size>
 static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
-                           const sycl::nd_item<3> &item_ct1, float *s_sum) {
+                           const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
     int start = item_ct1.get_group(2) * group_size;
     int end = start + group_size;
 
@@ -842,9 +889,8 @@ static void group_norm_f32(const float * x, float * dst, const int group_size, c
     }
 }
 
-template <int block_size>
 static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                         const sycl::nd_item<3> &item_ct1, float *s_sum) {
+                         const sycl::nd_item<3> &item_ct1, float *s_sum, int block_size) {
     const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
                     item_ct1.get_local_id(1);
     const int tid = item_ct1.get_local_id(2);
@@ -5656,12 +5702,13 @@ static void norm_f32_sycl(const float *x, float *dst, const int ncols,
                                   block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                     [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
-                                            s_sum_acc_ct1.get_pointer());
+                        norm_f32(x, dst, ncols, eps, item_ct1,
+                                            s_sum_acc_ct1.get_pointer(), WARP_SIZE);
                     });
         });
     } else {
-        const sycl::range<3> block_dims(1, 1, 1024);
+        const int work_group_size = g_work_group_size;
+        const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
@@ -5676,8 +5723,8 @@ static void norm_f32_sycl(const float *x, float *dst, const int ncols,
                                   block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                     [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32<1024>(x, dst, ncols, eps, item_ct1,
-                                       s_sum_acc_ct1.get_pointer());
+                        norm_f32(x, dst, ncols, eps, item_ct1,
+                                       s_sum_acc_ct1.get_pointer(), work_group_size);
                     });
         });
     }
@@ -5700,18 +5747,20 @@ static void group_norm_f32_sycl(const float *x, float *dst,
                                   block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                     [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32<WARP_SIZE>(
+                        group_norm_f32(
                             x, dst, group_size, ne_elements, eps_ct4, item_ct1,
-                            s_sum_acc_ct1.get_pointer());
+                            s_sum_acc_ct1.get_pointer(), WARP_SIZE);
                     });
         });
     } else {
-        const sycl::range<3> block_dims(1, 1, 1024);
+        const int work_group_size = g_work_group_size;
+        const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
         info::device::max_work_group_size. Adjust the work-group size if needed.
         */
+
         stream->submit([&](sycl::handler &cgh) {
             sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
                                                          cgh);
@@ -5723,9 +5772,9 @@ static void group_norm_f32_sycl(const float *x, float *dst,
                                   block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                     [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32<1024>(x, dst, group_size, ne_elements,
+                        group_norm_f32(x, dst, group_size, ne_elements,
                                              eps_ct4, item_ct1,
-                                             s_sum_acc_ct1.get_pointer());
+                                             s_sum_acc_ct1.get_pointer(), work_group_size);
                     });
         });
     }
@@ -5789,12 +5838,13 @@ static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
                                   block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                     [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
-                                                s_sum_acc_ct1.get_pointer());
+                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
+                                                s_sum_acc_ct1.get_pointer(), WARP_SIZE);
                     });
         });
     } else {
-        const sycl::range<3> block_dims(1, 1, 1024);
+        const int work_group_size = g_work_group_size;
+        const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
         the limit. To get the device limit, query
@@ -5809,8 +5859,8 @@ static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
                                   block_dims),
                 [=](sycl::nd_item<3> item_ct1)
                     [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32<1024>(x, dst, ncols, eps, item_ct1,
-                                           s_sum_acc_ct1.get_pointer());
+                        rms_norm_f32(x, dst, ncols, eps, item_ct1,
+                                           s_sum_acc_ct1.get_pointer(), work_group_size);
                     });
         });
     }
@@ -8067,7 +8117,7 @@ static void *ggml_sycl_pool_malloc_leg(size_t size, size_t *actual_size) try {
     int id;
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_index()));
-    GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg index %d\n", id);
+    // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg index %d\n", id);
 #ifdef DEBUG_SYCL_MALLOC
     int nnz = 0;
     size_t max_size = 0;
@@ -8110,9 +8160,11 @@ static void *ggml_sycl_pool_malloc_leg(size_t size, size_t *actual_size) try {
     void * ptr;
     size_t look_ahead_size = (size_t) (1.05 * size);
     look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+
+    const dpct::queue_ptr stream = g_syclStreams[id][0];
     SYCL_CHECK(
         CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
-                             look_ahead_size, dpct::get_in_order_queue())));
+                             look_ahead_size, *stream)));
     *actual_size = look_ahead_size;
     g_sycl_pool_size[id] += look_ahead_size;
 
@@ -8135,6 +8187,7 @@ static void ggml_sycl_pool_free_leg(void *ptr, size_t size) try {
     SYCL_CHECK(
         CHECK_TRY_ERROR(id = get_current_device_index()));
 
+    const dpct::queue_ptr stream = g_syclStreams[id][0];
     for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
         sycl_buffer& b = g_sycl_buffer_pool[id][i];
         if (b.ptr == nullptr) {
@@ -8144,7 +8197,7 @@ static void ggml_sycl_pool_free_leg(void *ptr, size_t size) try {
         }
     }
     fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_SYCL_BUFFERS\n");
-    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
+    SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *stream)));
     g_sycl_pool_size[id] -= size;
 }
 catch (sycl::exception const &exc) {
@@ -8236,7 +8289,7 @@ struct sycl_pool_alloc {
     T * alloc(size_t size) {
         GGML_ASSERT(ptr == nullptr);
         ptr = (T *) ggml_sycl_pool_malloc(size * sizeof(T), &this->actual_size);
-        GGML_SYCL_DEBUG("alloc %lu return %p actual size=%lu\n", size * sizeof(T), ptr, this->actual_size);
+        // GGML_SYCL_DEBUG("alloc %lu return %p actual size=%lu\n", size * sizeof(T), ptr, this->actual_size);
         return ptr;
     }
 
@@ -8299,6 +8352,14 @@ int get_sycl_env(const char* env_name, int default_val){
     return user_number;
 }
 
+int get_work_group_size(int user_device_id){
+    dpct::device_info prop;
+    dpct::get_device_info(
+        prop,
+        dpct::dev_mgr::instance().get_device(user_device_id));
+    return prop.get_max_work_group_size();
+}
+
 void ggml_init_sycl() try {
     static bool initialized = false;
 
@@ -8403,9 +8464,10 @@ void ggml_init_sycl() try {
                         dpct::get_current_device().create_queue()));
             }
 
+            const dpct::queue_ptr stream = g_syclStreams[device_inx][0];
             // create sycl handle
             SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[device_inx] =
-                                              &dpct::get_in_order_queue()));
+                                              stream));
             /*
             DPCT1027:89: The call to syclSetMathMode was replaced with 0
             because this functionality is redundant in SYCL.
@@ -8420,6 +8482,7 @@ void ggml_init_sycl() try {
         g_device_count = 1;
         ggml_sycl_set_main_device(user_device_id);
         ggml_sycl_set_device(user_device_id);
+        g_work_group_size = get_work_group_size(user_device_id);
         fprintf(stderr, "Using Device %d\n", user_device_id);
 
         // for (int id = 0; id < g_all_sycl_device_count; ++id) {
@@ -8468,6 +8531,7 @@ void *ggml_sycl_host_malloc(size_t size) try {
     }
 
     void * ptr = nullptr;
+    //allow to use dpct::get_in_order_queue() for host malloc
     dpct::err0 err = CHECK_TRY_ERROR(
         ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
     /*
@@ -8504,6 +8568,7 @@ catch (sycl::exception const &exc) {
 }
 
 void ggml_sycl_host_free(void *ptr) try {
+    //allow to use dpct::get_in_order_queue() for host malloc
     SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
 }
 catch (sycl::exception const &exc) {
@@ -9595,16 +9660,14 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
 
     ggml_sycl_set_device(g_main_device);
     dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
-    GGML_SYCL_DEBUG("g_main_device_index=%d, src0=%p main_stream=%p src0_on_device=%d\n",
-        g_main_device_index, src0, main_stream, src0_on_device);
+    // GGML_SYCL_DEBUG("g_main_device_index=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n",
+        // g_main_device_index, main_stream, src0_on_device, src1_on_device, dst_on_device);
 
     if (src0_on_device) {
         src0_ddf = (float *) src0_extra->data_device[g_main_device_index];
     } else {
         src0_ddf = src0_f.alloc(ggml_nelements(src0));
-        GGML_SYCL_DEBUG("g_main_device_index=%d, src0_ddf=%p\n", g_main_device_index, src0_ddf);
-
-        GGML_SYCL_DEBUG("before ggml_sycl_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0);
+        // GGML_SYCL_DEBUG("before ggml_sycl_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0);
         SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
     }
 
@@ -9613,7 +9676,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
             src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
         } else {
             src1_ddf = src1_f.alloc(ggml_nelements(src1));
-            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
+            // SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
         }
     }
     if (dst_on_device) {
@@ -9622,8 +9685,8 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
         dst_ddf = dst_f.alloc(ggml_nelements(dst));
     }
 
-    GGML_SYCL_DEBUG("op src0=%p, src1=%p, dst=%p, src0_ddf=%p, src1_ddf=%p, dst_ddf=%p, main_stream=%p\n",
-        src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
+    // GGML_SYCL_DEBUG("op src0=%p, src1=%p, dst=%p, src0_ddf=%p, src1_ddf=%p, dst_ddf=%p, main_stream=%p\n",
+        // src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
     // do the computation
     op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
     /*
@@ -9642,13 +9705,12 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
         SYCL_CHECK(CHECK_TRY_ERROR(
             dpct::get_current_device().queues_wait_and_throw()));
     }
+    // print_ggml_tensor("tensor", dst);
 }
 catch (sycl::exception const &exc) {
 
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
             << ", line:" << __LINE__ << std::endl;
-  int *foo = (int*)-1;
-  printf("%d\n", *foo);
   std::exit(1);
 }
 
@@ -10479,21 +10541,21 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 
     if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
         // KQ single-batch
-        GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_p021\n");
+        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_p021\n");
         ggml_sycl_mul_mat_vec_p021(src0, src1, dst);
     } else if (!split && all_on_device && !use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
         // KQV single-batch
-        GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
+        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n");
         ggml_sycl_mul_mat_vec_nc(src0, src1, dst);
     } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
         // KQ + KQV multi-batch
-        GGML_SYCL_DEBUG("ggml_sycl_mul_mat_mat_batched_sycl\n");
+        // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_mat_batched_sycl\n");
         ggml_sycl_mul_mat_mat_batched_sycl(src0, src1, dst);
     } else if (src0->type == GGML_TYPE_F32) {
-        GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
+        // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n");
         ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false);
     } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        GGML_SYCL_DEBUG("ggml_is_quantized or GGML_TYPE_F16\n");
+        // GGML_SYCL_DEBUG("ggml_is_quantized or GGML_TYPE_F16\n");
         if (src1->ne[1] == 1 && src0->ne[0] % GGML_SYCL_DMMV_X == 0) {
 #ifdef GGML_SYCL_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
@@ -10994,6 +11056,7 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
             continue;
         }
         ggml_sycl_set_device(get_device_id_by_index(id));
+        const dpct::queue_ptr stream = g_syclStreams[id][0];
 
         int64_t row_low, row_high;
         if (backend == GGML_BACKEND_GPU) {
@@ -11031,20 +11094,20 @@ void ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try {
 
         char * buf;
         SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
-                                        size, dpct::get_in_order_queue())));
+                                        size, *stream)));
         char * buf_host = (char *)data + offset_split;
 
         // set padding to 0 to avoid possible NaN values
         if (size > original_size) {
             SYCL_CHECK(CHECK_TRY_ERROR(
-                dpct::get_in_order_queue()
-                    .memset(buf + original_size, 0, size - original_size)
-                    .wait()));
+                (*stream)
+                .memset(buf + original_size, 0, size - original_size)
+                .wait()));
         }
 
-        SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_in_order_queue()
-                                        .memcpy(buf, buf_host, original_size)
-                                        .wait()));
+        SYCL_CHECK(CHECK_TRY_ERROR((*stream)
+                                    .memcpy(buf, buf_host, original_size)
+                                    .wait()));
 
         extra->data_device[id] = buf;
 
@@ -11072,10 +11135,10 @@ void ggml_sycl_free_data(struct ggml_tensor *tensor) try {
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
 
     for (int64_t id = 0; id < g_device_count; ++id) {
+        const dpct::queue_ptr stream = g_syclStreams[id][0];
         if (extra->data_device[id] != nullptr) {
             SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id)));
-            SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(
-                extra->data_device[id], dpct::get_in_order_queue())));
+            SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(extra->data_device[id], *stream)));
         }
 
         for (int64_t is = 0; is < MAX_STREAMS; ++is) {
@@ -11142,6 +11205,8 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
     const size_t size = ggml_nbytes(tensor);
 
     SYCL_CHECK(ggml_sycl_set_device(g_main_device));
+    const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
+
     if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
         ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
         char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index];
@@ -11166,7 +11231,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
         if (data == nullptr) {
             SYCL_CHECK(CHECK_TRY_ERROR(
                 data = (char *)sycl::malloc_device(
-                    g_scratch_size, dpct::get_in_order_queue())));
+                    g_scratch_size, *stream)));
             g_scratch_buffer = data;
         }
         extra = ggml_sycl_alloc_temp_tensor_extra();
@@ -11178,9 +11243,9 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor,
     } else { // allocate new buffers outside of scratch
         void * data;
         SYCL_CHECK(CHECK_TRY_ERROR(data = (void *)sycl::malloc_device(
-                                        size, dpct::get_in_order_queue())));
+                                        size, *stream)));
         SYCL_CHECK(CHECK_TRY_ERROR(
-            dpct::get_in_order_queue().memset(data, 0, size).wait()));
+            (*stream).memset(data, 0, size).wait()));
         extra = new ggml_tensor_extra_gpu;
         memset(extra, 0, sizeof(*extra));
         extra->data_device[g_main_device_index] = data;
@@ -11201,9 +11266,10 @@ void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor,
     }
     if (g_scratch_buffer == nullptr) {
         ggml_sycl_set_device(g_main_device);
+        const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
         SYCL_CHECK(
             CHECK_TRY_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
-                                 g_scratch_size, dpct::get_in_order_queue())));
+                                 g_scratch_size, *stream)));
     }
 
     ggml_tensor_extra_gpu * extra = ggml_sycl_alloc_temp_tensor_extra();
@@ -11236,7 +11302,8 @@ void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try {
 
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
     SYCL_CHECK(ggml_sycl_set_device(g_main_device));
-    SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_in_order_queue()
+    const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream)
                                     .memcpy(extra->data_device[g_main_device_index],
                                             tensor->data, ggml_nbytes(tensor))
                                     .wait()));
@@ -11300,9 +11367,11 @@ void ggml_sycl_free_scratch() try {
     if (g_scratch_buffer == nullptr) {
         return;
     }
+    ggml_sycl_set_device(g_main_device);
+    const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0];
 
     SYCL_CHECK(CHECK_TRY_ERROR(
-        sycl::free(g_scratch_buffer, dpct::get_in_order_queue())));
+        sycl::free(g_scratch_buffer, *stream)));
     g_scratch_buffer = nullptr;
 }
 catch (sycl::exception const &exc) {
@@ -11533,8 +11602,12 @@ struct ggml_backend_buffer_context_sycl {
 static void
 ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
     ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
+    ggml_sycl_set_device(ctx->device);
+    int device_index = get_device_index_by_id(ctx->device);
+    const dpct::queue_ptr stream = g_syclStreams[device_index][0];
+
     SYCL_CHECK(
-        CHECK_TRY_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
+        CHECK_TRY_ERROR(sycl::free(ctx->dev_ptr, *stream)));
     delete ctx;
 }
 catch (sycl::exception const &exc) {
@@ -11599,11 +11672,13 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
 
     ggml_sycl_set_device(ctx->device);
+    int device_index = get_device_index_by_id(ctx->device);
+    const dpct::queue_ptr stream = g_syclStreams[device_index][0];
     SYCL_CHECK(
         CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
     SYCL_CHECK(
-        CHECK_TRY_ERROR(dpct::get_in_order_queue()
+        CHECK_TRY_ERROR((*stream)
                              .memcpy((char *)tensor->data + offset, data, size)
                              .wait()));
 }
@@ -11622,11 +11697,14 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
     ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
 
     ggml_sycl_set_device(ctx->device);
+    int device_index = get_device_index_by_id(ctx->device);
+    const dpct::queue_ptr stream = g_syclStreams[device_index][0];
+
     SYCL_CHECK(
         CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
     SYCL_CHECK(CHECK_TRY_ERROR(
-        dpct::get_in_order_queue()
+        (*stream)
             .memcpy(data, (const char *)tensor->data + offset, size)
             .wait()));
 }
@@ -11641,10 +11719,12 @@ static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
     ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
 
     ggml_sycl_set_device(ctx->device);
+    int device_index = get_device_index_by_id(ctx->device);
+    const dpct::queue_ptr stream = g_syclStreams[device_index][0];
     SYCL_CHECK(
         CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw()));
 
-    SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_in_order_queue()
+    SYCL_CHECK(CHECK_TRY_ERROR((*stream)
                                     .memset(ctx->dev_ptr, value, buffer->size)
                                     .wait()));
 }
@@ -11673,12 +11753,13 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     int device = (int) (intptr_t) buft->context;
 
     ggml_sycl_set_device(device);
-
+    int device_index = get_device_index_by_id(device);
+    const dpct::queue_ptr stream = g_syclStreams[device_index][0];
     size = std::max(size, (size_t)1); // syclMalloc returns null for size 0
 
     void * dev_ptr;
     SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
-                                    size, dpct::get_in_order_queue())));
+                                    size, *stream)));
 
     ggml_backend_buffer_context_sycl * ctx = new ggml_backend_buffer_context_sycl(device, dev_ptr);
 
diff --git a/ggml-sycl.h b/ggml-sycl.h
index 9530c54c2e851..38db8c8e57386 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -49,6 +49,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 
 int get_main_device();
+void print_ggml_tensor(const char*name, struct ggml_tensor *src);
 
 #ifdef  __cplusplus
 }

From bd38129aeb01752183b87db861a62dffb459c774 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Fri, 12 Jan 2024 10:15:06 +0800
Subject: [PATCH 17/90] add print tensor function to debug

---
 ggml-sycl.cpp | 83 +++++++++++++++++++++++++++++++++++++++++----------
 ggml-sycl.h   |  1 +
 run.sh        |  5 ++--
 3 files changed, 71 insertions(+), 18 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 5b34289139f3e..28c9e76069507 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -428,15 +428,17 @@ static void bad_arch(const sycl::stream &stream_ct1) {
     (void) bad_arch; // suppress unused function warning
 }
 
-//todo: debug for crash in some case
-void print_ggml_tensor(const char*name, struct ggml_tensor *src){
+void log_ggml_var_device(const char*name, float *src, size_t total_elements, bool src_on_device){
     if(!g_ggml_sycl_debug) return;
+    if(!src){
+        printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
+        return;
+    }
     char filename[1024];
     sprintf(filename, "%s.txt", name);
-    printf("GGML Tensor:%s save to %s:\n", name, filename);
+    printf("GGML Tensor:%s save to %s\n", name, filename);
 
-    size_t total_size = ggml_nbytes(src);
-    const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT;
+    size_t total_size = total_elements*sizeof(float);
     float *local_buf = NULL;
     // printf("total_size %d2, src_on_device %d\n", total_size, src_on_device);
     if(src_on_device) {
@@ -445,17 +447,16 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){
         ggml_sycl_set_device(g_main_device);
         dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
 
-        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *)  src->extra;
-
         // printf("zjy before memcpy local_buf=%p, src->data=%p\n", local_buf, src->data);
-        main_stream->memcpy(local_buf, src_extra->data_device[g_main_device_index], total_size);
+        printf("zjy log dst_ddf=%p main_stream=%p g_main_device_index=%d\n", src,
+            main_stream, g_main_device_index);
+        main_stream->memcpy(local_buf, src, total_size);
     }
     else {
-        local_buf = (float *)src->data;
+        local_buf = (float *)src;
         // printf("local buf from src-> data %p\n", local_buf);
     }
 
-    size_t total_elements = ggml_nelements(src);
     std::ofstream logfile;
     logfile.open(filename);
     // printf("local buf element %d\n", total_elements);
@@ -466,9 +467,44 @@ void print_ggml_tensor(const char*name, struct ggml_tensor *src){
     logfile <<std::endl;
     logfile.close();
 
-    // printf("before free %p\n", local_buf);
     if(src_on_device) ggml_sycl_host_free(local_buf);
-    // printf("free done\n");
+}
+
+//todo: debug for crash in some case
+void print_ggml_tensor(const char*name, struct ggml_tensor *src){
+    if(!g_ggml_sycl_debug) return;
+    if(!src){
+        printf("GGML Tensor:%s skip to save for NULL pointer\n", name);
+        return;
+    }
+
+    size_t total_elements = ggml_nelements(src);
+
+    const bool src_on_device = src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT;
+    float *src_data =NULL;
+    if(src_on_device) {
+        ggml_tensor_extra_gpu * src_extra = (ggml_tensor_extra_gpu *)  src->extra;
+        src_data = (float*)src_extra->data_device[g_main_device_index];
+    }
+    else {
+        src_data = (float *)src->data;
+    }
+
+    log_ggml_var_device(name, src_data, total_elements, src_on_device);
+}
+
+static int log_file_name_idx=0;
+void log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cnt) {
+    stop_cnt = 4;
+    if(log_file_name_idx>=stop_cnt) return;
+    char filename[1280];
+    sprintf(filename, "%s_%07d", name, log_file_name_idx);
+    log_file_name_idx++;
+    print_ggml_tensor(filename, src);
+    // print_ggml_tensor("ggml_sycl_rms_norm_src0", (ggml_tensor *)src0);
+    // print_ggml_tensor("ggml_sycl_rms_norm_src1", (ggml_tensor *)src1);
+    // int *ptr = NULL;
+    // *ptr = 0;
 }
 
 static __dpct_inline__ float warp_reduce_sum(float x,
@@ -8588,7 +8624,7 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
     if (src->backend == GGML_BACKEND_CPU) {
         kind = dpct::host_to_device;
         src_ptr = (char *) src->data;
-        GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d  GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
+        // GGML_SYCL_DEBUG("ggml_sycl_cpy_tensor_2d  GGML_BACKEND_CPU src_ptr %p\n", src_ptr);
     } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
         GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
         kind = dpct::device_to_device;
@@ -8596,10 +8632,10 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
         int id;
         SYCL_CHECK(CHECK_TRY_ERROR(
             id = get_current_device_index()));
-        GGML_SYCL_DEBUG("current device index %d\n", id);
+        // GGML_SYCL_DEBUG("current device index %d\n", id);
         src_ptr = (char *) extra->data_device[id];
     } else {
-        GGML_SYCL_DEBUG("GGML_ASSERT(false)\n");
+        // GGML_SYCL_DEBUG("GGML_ASSERT(false)\n");
         GGML_ASSERT(false);
     }
     char * dst_ptr = (char *) dst;
@@ -8616,7 +8652,7 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
 
     const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
     if (nb0 == ts && nb1 == ts*ne0/bs) {
-        GGML_SYCL_DEBUG("stream->memcpy: dst_ptr=%p, x=%p, size=%lu\n", dst_ptr, x, i1_diff * nb1);
+        // GGML_SYCL_DEBUG("stream->memcpy: dst_ptr=%p, x=%p, size=%lu\n", dst_ptr, x, i1_diff * nb1);
         // return CHECK_TRY_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1));
         return CHECK_TRY_ERROR(dpct::async_dpct_memcpy(dst_ptr, x, i1_diff * nb1,
                                     kind, *stream));
@@ -9681,6 +9717,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
     }
     if (dst_on_device) {
         dst_ddf = (float *) dst_extra->data_device[g_main_device_index];
+        // printf("zjy dst_ddf=%p main_stream=%p g_main_device_index=%d\n", dst_ddf, main_stream, g_main_device_index);
     } else {
         dst_ddf = dst_f.alloc(ggml_nelements(dst));
     }
@@ -10120,6 +10157,9 @@ static void ggml_sycl_get_rows(const ggml_tensor * src0, const ggml_tensor * src
 static void ggml_sycl_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_add);
+    // log_tensor_with_cnt("log_ggml_sycl_add_src0", (struct ggml_tensor *) src0, 6);
+    // log_tensor_with_cnt("log_ggml_sycl_add_src1", (struct ggml_tensor *)src1, 6);
+    // log_tensor_with_cnt("log_ggml_sycl_add_dst", dst, 6);
 }
 
 static void ggml_sycl_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -10130,6 +10170,10 @@ static void ggml_sycl_acc(const ggml_tensor * src0, const ggml_tensor * src1, gg
 static void ggml_sycl_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_mul);
+    // log_tensor_with_cnt("log_ggml_sycl_mul_src0", (struct ggml_tensor *)src0, 6);
+    // log_tensor_with_cnt("log_ggml_sycl_mul_src1", (struct ggml_tensor *)src1, 6);
+    // log_tensor_with_cnt("log_ggml_sycl_mul_dst", dst, 6);
+
 }
 
 static void ggml_sycl_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -10197,9 +10241,16 @@ static void ggml_sycl_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pad);
 }
 
+
 static void ggml_sycl_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rms_norm);
+    log_tensor_with_cnt("log_ggml_sycl_rms_norm_src0", (struct ggml_tensor *)src0, 6);
+    log_tensor_with_cnt("log_ggml_sycl_rms_norm_src1", (struct ggml_tensor *)src1, 6);
+    log_tensor_with_cnt("log_ggml_sycl_rms_norm_dst", dst, 6);
+
+    // int *ptr = NULL;
+    // *ptr = 0;
 }
 
 bool ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
diff --git a/ggml-sycl.h b/ggml-sycl.h
index 38db8c8e57386..02b4ab258c3bd 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -50,6 +50,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 
 int get_main_device();
 void print_ggml_tensor(const char*name, struct ggml_tensor *src);
+void log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cnt);
 
 #ifdef  __cplusplus
 }
diff --git a/run.sh b/run.sh
index 39079dd20f9b7..38315a4650f61 100755
--- a/run.sh
+++ b/run.sh
@@ -11,8 +11,9 @@ else
   export GGML_SYCL_DEVICE=0
 fi
 echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
-#export GGML_SYCL_DEBUG=1
+export GGML_SYCL_DEBUG=1
 #export GGML_SYCL_LIST_DEVICE=1
 #./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
+#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
+./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
 

From 5b5389941eb9f7fd3672d0174309fa001ed9ccc8 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Sat, 13 Jan 2024 19:55:30 +0800
Subject: [PATCH 18/90] fix error: wrong result in
 658746bb26702e50f2c59c0e4ada8e9da6010481

---
 ggml-sycl.cpp | 2 +-
 run.sh        | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 28c9e76069507..1931b80942e2a 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -9712,7 +9712,7 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0,
             src1_ddf = (float *) src1_extra->data_device[g_main_device_index];
         } else {
             src1_ddf = src1_f.alloc(ggml_nelements(src1));
-            // SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
+            SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
         }
     }
     if (dst_on_device) {
diff --git a/run.sh b/run.sh
index 38315a4650f61..254a18efc2c65 100755
--- a/run.sh
+++ b/run.sh
@@ -11,9 +11,9 @@ else
   export GGML_SYCL_DEVICE=0
 fi
 echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
-export GGML_SYCL_DEBUG=1
+#export GGML_SYCL_DEBUG=1
 #export GGML_SYCL_LIST_DEVICE=1
 #./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
-#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
+./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
+#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
 

From a47f5ec42e3e78b88be549d3b0146f690fbd1758 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Sat, 13 Jan 2024 20:33:42 +0800
Subject: [PATCH 19/90] summary dpct definition in one header file to replace
 folder:dpct

---
 dpct.hpp                           | 2831 ++++++++++++++++
 dpct/atomic.hpp                    |  842 -----
 dpct/blas_utils.hpp                | 1792 ----------
 dpct/ccl_utils.hpp                 |  286 --
 dpct/device.hpp                    |  781 -----
 dpct/dnnl_utils.hpp                | 4921 ----------------------------
 dpct/dpct.hpp                      |   62 -
 dpct/dpl_extras/algorithm.h        | 2419 --------------
 dpct/dpl_extras/dpcpp_extensions.h |  747 -----
 dpct/dpl_extras/functional.h       |  453 ---
 dpct/dpl_extras/iterators.h        |  347 --
 dpct/dpl_extras/memory.h           | 1024 ------
 dpct/dpl_extras/numeric.h          |   32 -
 dpct/dpl_extras/vector.h           |  752 -----
 dpct/dpl_utils.hpp                 |   26 -
 dpct/fft_utils.hpp                 | 1376 --------
 dpct/image.hpp                     |  901 -----
 dpct/kernel.hpp                    |  459 ---
 dpct/lapack_utils.hpp              | 1953 -----------
 dpct/lib_common_utils.hpp          |  174 -
 dpct/math.hpp                      | 1814 ----------
 dpct/memory.hpp                    | 1497 ---------
 dpct/rng_utils.hpp                 |  535 ---
 dpct/sparse_utils.hpp              | 1385 --------
 dpct/util.hpp                      | 1070 ------
 ggml-sycl.cpp                      |    7 +-
 run.sh                             |    2 +-
 27 files changed, 2836 insertions(+), 25652 deletions(-)
 create mode 100644 dpct.hpp
 delete mode 100644 dpct/atomic.hpp
 delete mode 100644 dpct/blas_utils.hpp
 delete mode 100644 dpct/ccl_utils.hpp
 delete mode 100644 dpct/device.hpp
 delete mode 100644 dpct/dnnl_utils.hpp
 delete mode 100644 dpct/dpct.hpp
 delete mode 100644 dpct/dpl_extras/algorithm.h
 delete mode 100644 dpct/dpl_extras/dpcpp_extensions.h
 delete mode 100644 dpct/dpl_extras/functional.h
 delete mode 100644 dpct/dpl_extras/iterators.h
 delete mode 100644 dpct/dpl_extras/memory.h
 delete mode 100644 dpct/dpl_extras/numeric.h
 delete mode 100644 dpct/dpl_extras/vector.h
 delete mode 100644 dpct/dpl_utils.hpp
 delete mode 100644 dpct/fft_utils.hpp
 delete mode 100644 dpct/image.hpp
 delete mode 100644 dpct/kernel.hpp
 delete mode 100644 dpct/lapack_utils.hpp
 delete mode 100644 dpct/lib_common_utils.hpp
 delete mode 100644 dpct/math.hpp
 delete mode 100644 dpct/memory.hpp
 delete mode 100644 dpct/rng_utils.hpp
 delete mode 100644 dpct/sparse_utils.hpp
 delete mode 100644 dpct/util.hpp

diff --git a/dpct.hpp b/dpct.hpp
new file mode 100644
index 0000000000000..874fa13094ae5
--- /dev/null
+++ b/dpct.hpp
@@ -0,0 +1,2831 @@
+// COPY from DPCT head files
+// To clear the code, copy/paste the variable/macro/function from following files.
+// It' possible to get better performance from newer function version DPCT head files.
+// #include <dpct/dpct.hpp>
+// #include <dpct/blas_utils.hpp>
+// #include <dpct/lib_common_utils.hpp>
+// #include <dpct/math.hpp>
+// #include <dpct/utils.hpp>
+// #include <dpct/memory.hpp>
+// #include <dpct/device.hpp>
+// #include <dpct/lapack_utils.hpp>
+
+
+#include <algorithm>
+#include <array>
+#include <cstring>
+#include <iostream>
+#include <mutex>
+#include <set>
+#include <sstream>
+#include <map>
+#include <vector>
+#include <thread>
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <mutex>
+#include <unordered_map>
+#include <map>
+#include <utility>
+#include <thread>
+#include <type_traits>
+
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+
+#if defined(__linux__)
+#include <sys/mman.h>
+#elif defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#else
+#error "Only support Windows and Linux."
+#endif
+
+#if defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#endif
+#if defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#define DPCT_COMPATIBILITY_TEMP (900)
+
+#if defined(_MSC_VER)
+#define __dpct_align__(n) __declspec(align(n))
+#define __dpct_inline__ __forceinline
+#else
+#define __dpct_align__(n) __attribute__((aligned(n)))
+#define __dpct_inline__ __inline__ __attribute__((always_inline))
+#endif
+
+#if defined(_MSC_VER)
+#define __dpct_noinline__ __declspec(noinline)
+#else
+#define __dpct_noinline__ __attribute__((noinline))
+#endif
+
+namespace dpct
+{
+  typedef sycl::queue *queue_ptr;
+  typedef sycl::event *event_ptr;
+  typedef char *device_ptr;
+  typedef uint8_t byte_t;
+  typedef sycl::buffer<byte_t> buffer_t;
+
+  /// SYCL default exception handler
+  inline auto exception_handler = [](sycl::exception_list exceptions)
+  {
+    for (std::exception_ptr const &e : exceptions)
+    {
+      try
+      {
+        std::rethrow_exception(e);
+      }
+      catch (sycl::exception const &e)
+      {
+        std::cerr << "Caught asynchronous SYCL exception:" << std::endl
+                  << e.what() << std::endl
+                  << "Exception caught at file:" << __FILE__
+                  << ", line:" << __LINE__ << std::endl;
+      }
+    }
+  };
+
+  enum error_code
+  {
+    success = 0,
+    default_error = 999
+  };
+
+  enum memcpy_direction
+  {
+    host_to_host,
+    host_to_device,
+    device_to_host,
+    device_to_device,
+    automatic
+  };
+
+  enum memory_region
+  {
+    global = 0, // device global memory
+    constant,   // device constant memory
+    local,      // device local memory
+    shared,     // memory which can be accessed by host and device
+  };
+
+  enum class library_data_t : unsigned char
+  {
+    real_float = 0,
+    complex_float,
+    real_double,
+    complex_double,
+    real_half,
+    complex_half,
+    real_bfloat16,
+    complex_bfloat16,
+    real_int4,
+    complex_int4,
+    real_uint4,
+    complex_uint4,
+    real_int8,
+    complex_int8,
+    real_uint8,
+    complex_uint8,
+    real_int16,
+    complex_int16,
+    real_uint16,
+    complex_uint16,
+    real_int32,
+    complex_int32,
+    real_uint32,
+    complex_uint32,
+    real_int64,
+    complex_int64,
+    real_uint64,
+    complex_uint64,
+    real_int8_4,
+    real_int8_32,
+    real_uint8_4,
+    library_data_t_size
+  };
+
+  template <typename T>
+  struct DataType
+  {
+    using T2 = T;
+  };
+  template <typename T>
+  struct DataType<sycl::vec<T, 2>>
+  {
+    using T2 = std::complex<T>;
+  };
+
+  static void destroy_event(event_ptr event)
+  {
+    delete event;
+  }
+
+  static inline unsigned int get_tid()
+  {
+#if defined(__linux__)
+    return syscall(SYS_gettid);
+#elif defined(_WIN64)
+    return GetCurrentThreadId();
+#else
+#error "Only support Windows and Linux."
+#endif
+  }
+
+  namespace detail
+  {
+    static void get_version(const sycl::device &dev, int &major, int &minor)
+    {
+      // Version string has the following format:
+      // a. OpenCL<space><major.minor><space><vendor-specific-information>
+      // b. <major.minor>
+      std::string ver;
+      ver = dev.get_info<sycl::info::device::version>();
+      std::string::size_type i = 0;
+      while (i < ver.size())
+      {
+        if (isdigit(ver[i]))
+          break;
+        i++;
+      }
+      major = std::stoi(&(ver[i]));
+      while (i < ver.size())
+      {
+        if (ver[i] == '.')
+          break;
+        i++;
+      }
+      i++;
+      minor = std::stoi(&(ver[i]));
+    }
+
+    template <typename tag, typename T>
+    class generic_error_type
+    {
+    public:
+      generic_error_type() = default;
+      generic_error_type(T value) : value{value} {}
+      operator T() const { return value; }
+
+    private:
+      T value;
+    };
+
+  } // namespace detail
+
+  /// Pitched 2D/3D memory data.
+  class pitched_data
+  {
+  public:
+    pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
+    pitched_data(void *data, size_t pitch, size_t x, size_t y)
+        : _data(data), _pitch(pitch), _x(x), _y(y) {}
+
+    void *get_data_ptr() { return _data; }
+    void set_data_ptr(void *data) { _data = data; }
+
+    size_t get_pitch() { return _pitch; }
+    void set_pitch(size_t pitch) { _pitch = pitch; }
+
+    size_t get_x() { return _x; }
+    void set_x(size_t x) { _x = x; };
+
+    size_t get_y() { return _y; }
+    void set_y(size_t y) { _y = y; }
+
+  private:
+    void *_data;
+    size_t _pitch, _x, _y;
+  };
+
+  class device_info
+  {
+  public:
+    // get interface
+    const char *get_name() const { return _name; }
+    char *get_name() { return _name; }
+    template <typename WorkItemSizesTy = sycl::range<3>,
+              std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                   std::is_same_v<WorkItemSizesTy, int *>,
+                               int> = 0>
+    auto get_max_work_item_sizes() const
+    {
+      if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
+        return sycl::range<3>(_max_work_item_sizes_i[0],
+                              _max_work_item_sizes_i[1],
+                              _max_work_item_sizes_i[2]);
+      else
+      {
+        return _max_work_item_sizes_i;
+      }
+    }
+    template <typename WorkItemSizesTy = sycl::range<3>,
+              std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                   std::is_same_v<WorkItemSizesTy, int *>,
+                               int> = 0>
+    auto get_max_work_item_sizes()
+    {
+      if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
+        return sycl::range<3>(_max_work_item_sizes_i[0],
+                              _max_work_item_sizes_i[1],
+                              _max_work_item_sizes_i[2]);
+      else
+      {
+        return _max_work_item_sizes_i;
+      }
+    }
+    bool get_host_unified_memory() const { return _host_unified_memory; }
+    int get_major_version() const { return _major; }
+    int get_minor_version() const { return _minor; }
+    int get_integrated() const { return _integrated; }
+    int get_max_clock_frequency() const { return _frequency; }
+    int get_max_compute_units() const { return _max_compute_units; }
+    int get_max_work_group_size() const { return _max_work_group_size; }
+    int get_max_sub_group_size() const { return _max_sub_group_size; }
+    int get_max_work_items_per_compute_unit() const
+    {
+      return _max_work_items_per_compute_unit;
+    }
+    int get_max_register_size_per_work_group() const
+    {
+      return _max_register_size_per_work_group;
+    }
+    template <typename NDRangeSizeTy = size_t *,
+              std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                   std::is_same_v<NDRangeSizeTy, int *>,
+                               int> = 0>
+    auto get_max_nd_range_size() const
+    {
+      if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+        return _max_nd_range_size;
+      else
+        return _max_nd_range_size_i;
+    }
+    template <typename NDRangeSizeTy = size_t *,
+              std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                   std::is_same_v<NDRangeSizeTy, int *>,
+                               int> = 0>
+    auto get_max_nd_range_size()
+    {
+      if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+        return _max_nd_range_size;
+      else
+        return _max_nd_range_size_i;
+    }
+    size_t get_global_mem_size() const { return _global_mem_size; }
+    size_t get_local_mem_size() const { return _local_mem_size; }
+    /// Returns the maximum clock rate of device's global memory in kHz. If
+    /// compiler does not support this API then returns default value 3200000 kHz.
+    unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
+    /// Returns the maximum bus width between device and memory in bits. If
+    /// compiler does not support this API then returns default value 64 bits.
+    unsigned int get_memory_bus_width() const { return _memory_bus_width; }
+    uint32_t get_device_id() const { return _device_id; }
+    std::array<unsigned char, 16> get_uuid() const { return _uuid; }
+    /// Returns global memory cache size in bytes.
+    unsigned int get_global_mem_cache_size() const
+    {
+      return _global_mem_cache_size;
+    }
+
+    // set interface
+    void set_name(const char *name)
+    {
+      size_t length = strlen(name);
+      if (length < 256)
+      {
+        std::memcpy(_name, name, length + 1);
+      }
+      else
+      {
+        std::memcpy(_name, name, 255);
+        _name[255] = '\0';
+      }
+    }
+    void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes)
+    {
+      for (int i = 0; i < 3; ++i)
+        _max_work_item_sizes_i[i] = max_work_item_sizes[i];
+    }
+    [[deprecated]] void
+    set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
+    {
+      for (int i = 0; i < 3; ++i)
+      {
+        _max_work_item_sizes_i[i] = max_work_item_sizes[i];
+      }
+    }
+    void set_host_unified_memory(bool host_unified_memory)
+    {
+      _host_unified_memory = host_unified_memory;
+    }
+    void set_major_version(int major) { _major = major; }
+    void set_minor_version(int minor) { _minor = minor; }
+    void set_integrated(int integrated) { _integrated = integrated; }
+    void set_max_clock_frequency(int frequency) { _frequency = frequency; }
+    void set_max_compute_units(int max_compute_units)
+    {
+      _max_compute_units = max_compute_units;
+    }
+    void set_global_mem_size(size_t global_mem_size)
+    {
+      _global_mem_size = global_mem_size;
+    }
+    void set_local_mem_size(size_t local_mem_size)
+    {
+      _local_mem_size = local_mem_size;
+    }
+    void set_max_work_group_size(int max_work_group_size)
+    {
+      _max_work_group_size = max_work_group_size;
+    }
+    void set_max_sub_group_size(int max_sub_group_size)
+    {
+      _max_sub_group_size = max_sub_group_size;
+    }
+    void
+    set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
+    {
+      _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
+    }
+    void set_max_nd_range_size(int max_nd_range_size[])
+    {
+      for (int i = 0; i < 3; i++)
+      {
+        _max_nd_range_size[i] = max_nd_range_size[i];
+        _max_nd_range_size_i[i] = max_nd_range_size[i];
+      }
+    }
+    void set_memory_clock_rate(unsigned int memory_clock_rate)
+    {
+      _memory_clock_rate = memory_clock_rate;
+    }
+    void set_memory_bus_width(unsigned int memory_bus_width)
+    {
+      _memory_bus_width = memory_bus_width;
+    }
+    void
+    set_max_register_size_per_work_group(int max_register_size_per_work_group)
+    {
+      _max_register_size_per_work_group = max_register_size_per_work_group;
+    }
+    void set_device_id(uint32_t device_id)
+    {
+      _device_id = device_id;
+    }
+    void set_uuid(std::array<unsigned char, 16> uuid)
+    {
+      _uuid = std::move(uuid);
+    }
+    void set_global_mem_cache_size(unsigned int global_mem_cache_size)
+    {
+      _global_mem_cache_size = global_mem_cache_size;
+    }
+
+  private:
+    char _name[256];
+    int _max_work_item_sizes_i[3];
+    bool _host_unified_memory = false;
+    int _major;
+    int _minor;
+    int _integrated = 0;
+    int _frequency;
+    // Set estimated value 3200000 kHz as default value.
+    unsigned int _memory_clock_rate = 3200000;
+    // Set estimated value 64 bits as default value.
+    unsigned int _memory_bus_width = 64;
+    unsigned int _global_mem_cache_size;
+    int _max_compute_units;
+    int _max_work_group_size;
+    int _max_sub_group_size;
+    int _max_work_items_per_compute_unit;
+    int _max_register_size_per_work_group;
+    size_t _global_mem_size;
+    size_t _local_mem_size;
+    size_t _max_nd_range_size[3];
+    int _max_nd_range_size_i[3];
+    uint32_t _device_id;
+    std::array<unsigned char, 16> _uuid;
+  };
+
+  static int get_major_version(const sycl::device &dev)
+  {
+    int major, minor;
+    detail::get_version(dev, major, minor);
+    return major;
+  }
+
+  static int get_minor_version(const sycl::device &dev)
+  {
+    int major, minor;
+    detail::get_version(dev, major, minor);
+    return minor;
+  }
+
+  static void get_device_info(device_info &out, const sycl::device &dev)
+  {
+    device_info prop;
+    prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
+
+    int major, minor;
+    detail::get_version(dev, major, minor);
+    prop.set_major_version(major);
+    prop.set_minor_version(minor);
+
+    prop.set_max_work_item_sizes(
+#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
+        // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
+        // is an enum class element
+        dev.get_info<sycl::info::device::max_work_item_sizes>());
+#else
+        // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
+        // an int
+        dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
+#endif
+    prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));
+
+    prop.set_max_clock_frequency(
+        dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);
+
+    prop.set_max_compute_units(
+        dev.get_info<sycl::info::device::max_compute_units>());
+    prop.set_max_work_group_size(
+        dev.get_info<sycl::info::device::max_work_group_size>());
+    prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
+    prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
+
+#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
+    if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
+    {
+      unsigned int tmp =
+          dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
+      if (tmp != 0)
+        prop.set_memory_clock_rate(1000 * tmp);
+    }
+    if (dev.has(sycl::aspect::ext_intel_memory_bus_width))
+    {
+      prop.set_memory_bus_width(
+          dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
+    }
+    if (dev.has(sycl::aspect::ext_intel_device_id))
+    {
+      prop.set_device_id(
+          dev.get_info<sycl::ext::intel::info::device::device_id>());
+    }
+    if (dev.has(sycl::aspect::ext_intel_device_info_uuid))
+    {
+      prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
+    }
+#elif defined(_MSC_VER) && !defined(__clang__)
+#pragma message("get_device_info: querying memory_clock_rate and \
+        memory_bus_width are not supported by the compiler used. \
+        Use 3200000 kHz as memory_clock_rate default value. \
+        Use 64 bits as memory_bus_width default value.")
+#else
+#warning "get_device_info: querying memory_clock_rate and \
+        memory_bus_width are not supported by the compiler used. \
+        Use 3200000 kHz as memory_clock_rate default value. \
+        Use 64 bits as memory_bus_width default value."
+#endif
+
+    size_t max_sub_group_size = 1;
+    std::vector<size_t> sub_group_sizes =
+        dev.get_info<sycl::info::device::sub_group_sizes>();
+
+    for (const auto &sub_group_size : sub_group_sizes)
+    {
+      if (max_sub_group_size < sub_group_size)
+        max_sub_group_size = sub_group_size;
+    }
+
+    prop.set_max_sub_group_size(max_sub_group_size);
+
+    prop.set_max_work_items_per_compute_unit(
+        dev.get_info<sycl::info::device::max_work_group_size>());
+    int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+    prop.set_max_nd_range_size(max_nd_range_size);
+
+    // Estimates max register size per work group, feel free to update the value
+    // according to device properties.
+    prop.set_max_register_size_per_work_group(65536);
+
+    prop.set_global_mem_cache_size(
+        dev.get_info<sycl::info::device::global_mem_cache_size>());
+    out = prop;
+  }
+
+  /// dpct device extension
+  class device_ext : public sycl::device
+  {
+    typedef std::mutex mutex_type;
+
+  public:
+    device_ext() : sycl::device(), _ctx(*this) {}
+    ~device_ext()
+    {
+      std::lock_guard<mutex_type> lock(m_mutex);
+      clear_queues();
+    }
+    device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
+    {
+      std::lock_guard<mutex_type> lock(m_mutex);
+      init_queues();
+    }
+
+    int is_native_atomic_supported() { return 0; }
+    int get_major_version() const
+    {
+      return dpct::get_major_version(*this);
+    }
+
+    int get_minor_version() const
+    {
+      return dpct::get_minor_version(*this);
+    }
+
+    int get_max_compute_units() const
+    {
+      return get_device_info().get_max_compute_units();
+    }
+
+    /// Return the maximum clock frequency of this device in KHz.
+    int get_max_clock_frequency() const
+    {
+      return get_device_info().get_max_clock_frequency();
+    }
+
+    int get_integrated() const { return get_device_info().get_integrated(); }
+
+    int get_max_sub_group_size() const
+    {
+      return get_device_info().get_max_sub_group_size();
+    }
+
+    int get_max_register_size_per_work_group() const
+    {
+      return get_device_info().get_max_register_size_per_work_group();
+    }
+
+    int get_max_work_group_size() const
+    {
+      return get_device_info().get_max_work_group_size();
+    }
+
+    int get_mem_base_addr_align() const
+    {
+      return get_info<sycl::info::device::mem_base_addr_align>();
+    }
+
+    size_t get_global_mem_size() const
+    {
+      return get_device_info().get_global_mem_size();
+    }
+
+    /// Get the number of bytes of free and total memory on the SYCL device.
+    /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
+    /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
+    void get_memory_info(size_t &free_memory, size_t &total_memory)
+    {
+#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
+      if (!has(sycl::aspect::ext_intel_free_memory))
+      {
+        std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
+        free_memory = 0;
+      }
+      else
+      {
+        free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
+      }
+#else
+      std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
+      free_memory = 0;
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma message("Querying the number of bytes of free memory is not supported")
+#else
+#warning "Querying the number of bytes of free memory is not supported"
+#endif
+#endif
+      total_memory = get_device_info().get_global_mem_size();
+    }
+
+    void get_device_info(device_info &out) const
+    {
+      dpct::get_device_info(out, *this);
+    }
+
+    device_info get_device_info() const
+    {
+      device_info prop;
+      dpct::get_device_info(prop, *this);
+      return prop;
+    }
+
+    void reset()
+    {
+      std::lock_guard<mutex_type> lock(m_mutex);
+      clear_queues();
+      init_queues();
+    }
+
+    sycl::queue &in_order_queue() { return *_q_in_order; }
+
+    sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
+
+    sycl::queue &default_queue()
+    {
+#ifdef DPCT_USM_LEVEL_NONE
+      return out_of_order_queue();
+#else
+      return in_order_queue();
+#endif // DPCT_USM_LEVEL_NONE
+    }
+
+    void queues_wait_and_throw()
+    {
+      std::unique_lock<mutex_type> lock(m_mutex);
+      std::vector<std::shared_ptr<sycl::queue>> current_queues(
+          _queues);
+      lock.unlock();
+      for (const auto &q : current_queues)
+      {
+        q->wait_and_throw();
+      }
+      // Guard the destruct of current_queues to make sure the ref count is safe.
+      lock.lock();
+    }
+
+    sycl::queue *create_queue(bool enable_exception_handler = false)
+    {
+#ifdef DPCT_USM_LEVEL_NONE
+      return create_out_of_order_queue(enable_exception_handler);
+#else
+      return create_in_order_queue(enable_exception_handler);
+#endif // DPCT_USM_LEVEL_NONE
+    }
+
+    sycl::queue *create_in_order_queue(bool enable_exception_handler = false)
+    {
+      std::lock_guard<mutex_type> lock(m_mutex);
+      return create_queue_impl(enable_exception_handler,
+                               sycl::property::queue::in_order());
+    }
+
+    sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false)
+    {
+      std::lock_guard<mutex_type> lock(m_mutex);
+      return create_queue_impl(enable_exception_handler);
+    }
+
+    void destroy_queue(sycl::queue *&queue)
+    {
+      std::lock_guard<mutex_type> lock(m_mutex);
+      _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
+                                   [=](const std::shared_ptr<sycl::queue> &q) -> bool
+                                   {
+                                     return q.get() == queue;
+                                   }),
+                    _queues.end());
+      queue = nullptr;
+    }
+    void set_saved_queue(sycl::queue *q)
+    {
+      std::lock_guard<mutex_type> lock(m_mutex);
+      _saved_queue = q;
+    }
+    sycl::queue *get_saved_queue() const
+    {
+      std::lock_guard<mutex_type> lock(m_mutex);
+      return _saved_queue;
+    }
+    sycl::context get_context() const { return _ctx; }
+
+  private:
+    void clear_queues()
+    {
+      _queues.clear();
+      _q_in_order = _q_out_of_order = _saved_queue = nullptr;
+    }
+
+    void init_queues()
+    {
+      _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
+      _q_out_of_order = create_queue_impl(true);
+      _saved_queue = &default_queue();
+    }
+
+    /// Caller should acquire resource \p m_mutex before calling this function.
+    template <class... Properties>
+    sycl::queue *create_queue_impl(bool enable_exception_handler,
+                                   Properties... properties)
+    {
+      sycl::async_handler eh = {};
+      if (enable_exception_handler)
+      {
+        eh = exception_handler;
+      }
+      _queues.push_back(std::make_shared<sycl::queue>(
+          _ctx, *this, eh,
+          sycl::property_list(
+#ifdef DPCT_PROFILING_ENABLED
+              sycl::property::queue::enable_profiling(),
+#endif
+              properties...)));
+
+      return _queues.back().get();
+    }
+
+    void get_version(int &major, int &minor) const
+    {
+      detail::get_version(*this, major, minor);
+    }
+    sycl::queue *_q_in_order, *_q_out_of_order;
+    sycl::queue *_saved_queue;
+    sycl::context _ctx;
+    std::vector<std::shared_ptr<sycl::queue>> _queues;
+    mutable mutex_type m_mutex;
+  };
+
+  /// device manager
+  class dev_mgr
+  {
+  public:
+    device_ext &current_device()
+    {
+      unsigned int dev_id = current_device_id();
+      check_id(dev_id);
+      return *_devs[dev_id];
+    }
+    device_ext &cpu_device() const
+    {
+      std::lock_guard<std::recursive_mutex> lock(m_mutex);
+      if (_cpu_device == -1)
+      {
+        throw std::runtime_error("no valid cpu device");
+      }
+      else
+      {
+        return *_devs[_cpu_device];
+      }
+    }
+    device_ext &get_device(unsigned int id) const
+    {
+      std::lock_guard<std::recursive_mutex> lock(m_mutex);
+      check_id(id);
+      return *_devs[id];
+    }
+    unsigned int current_device_id() const
+    {
+      std::lock_guard<std::recursive_mutex> lock(m_mutex);
+      auto it = _thread2dev_map.find(get_tid());
+      if (it != _thread2dev_map.end())
+        return it->second;
+      return DEFAULT_DEVICE_ID;
+    }
+
+    /// Select device with a device ID.
+    /// \param [in] id The id of the device which can
+    /// be obtained through get_device_id(const sycl::device).
+    void select_device(unsigned int id)
+    {
+      std::lock_guard<std::recursive_mutex> lock(m_mutex);
+      check_id(id);
+      _thread2dev_map[get_tid()] = id;
+    }
+    unsigned int device_count() { return _devs.size(); }
+
+    unsigned int get_device_id(const sycl::device &dev)
+    {
+      unsigned int id = 0;
+      for (auto dev_item : _devs)
+      {
+        if (*dev_item == dev)
+        {
+          break;
+        }
+        id++;
+      }
+      return id;
+    }
+
+    template <class DeviceSelector>
+    std::enable_if_t<
+        std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
+    select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
+    {
+      sycl::device selected_device = sycl::device(selector);
+      unsigned int selected_device_id = get_device_id(selected_device);
+      select_device(selected_device_id);
+    }
+
+    /// Returns the instance of device manager singleton.
+    static dev_mgr &instance()
+    {
+      static dev_mgr d_m;
+      return d_m;
+    }
+    dev_mgr(const dev_mgr &) = delete;
+    dev_mgr &operator=(const dev_mgr &) = delete;
+    dev_mgr(dev_mgr &&) = delete;
+    dev_mgr &operator=(dev_mgr &&) = delete;
+
+  private:
+    mutable std::recursive_mutex m_mutex;
+    dev_mgr()
+    {
+      sycl::device default_device =
+          sycl::device(sycl::default_selector_v);
+      _devs.push_back(std::make_shared<device_ext>(default_device));
+
+      std::vector<sycl::device> sycl_all_devs =
+          sycl::device::get_devices(sycl::info::device_type::all);
+      // Collect other devices except for the default device.
+      if (default_device.is_cpu())
+        _cpu_device = 0;
+      for (auto &dev : sycl_all_devs)
+      {
+        if (dev == default_device)
+        {
+          continue;
+        }
+        _devs.push_back(std::make_shared<device_ext>(dev));
+        if (_cpu_device == -1 && dev.is_cpu())
+        {
+          _cpu_device = _devs.size() - 1;
+        }
+      }
+    }
+    void check_id(unsigned int id) const
+    {
+      if (id >= _devs.size())
+      {
+        throw std::runtime_error("invalid device id");
+      }
+    }
+    std::vector<std::shared_ptr<device_ext>> _devs;
+    /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
+    /// thread id in _thread2dev_map, which means default device should be used
+    /// for the current thread.
+    const unsigned int DEFAULT_DEVICE_ID = 0;
+    /// thread-id to device-id map.
+    std::map<unsigned int, unsigned int> _thread2dev_map;
+    int _cpu_device = -1;
+  };
+
+  static inline sycl::queue &get_default_queue()
+  {
+    return dev_mgr::instance().current_device().default_queue();
+  }
+
+  namespace detail
+  {
+    enum class pointer_access_attribute
+    {
+      host_only = 0,
+      device_only,
+      host_device,
+      end
+    };
+
+    static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
+                                                          const void *ptr)
+    {
+#ifdef DPCT_USM_LEVEL_NONE
+      return mem_mgr::instance().is_device_ptr(ptr)
+                 ? pointer_access_attribute::device_only
+                 : pointer_access_attribute::host_only;
+#else
+      switch (sycl::get_pointer_type(ptr, q.get_context()))
+      {
+      case sycl::usm::alloc::unknown:
+        return pointer_access_attribute::host_only;
+      case sycl::usm::alloc::device:
+        return pointer_access_attribute::device_only;
+      case sycl::usm::alloc::shared:
+      case sycl::usm::alloc::host:
+        return pointer_access_attribute::host_device;
+      }
+#endif
+    }
+
+    template <typename ArgT>
+    inline constexpr std::uint64_t get_type_combination_id(ArgT Val)
+    {
+      static_assert((unsigned char)library_data_t::library_data_t_size <=
+                        std::numeric_limits<unsigned char>::max() &&
+                    "library_data_t size exceeds limit.");
+      static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
+      return (std::uint64_t)Val;
+    }
+
+    template <typename FirstT, typename... RestT>
+    inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
+                                                           RestT... RestVal)
+    {
+      static_assert((std::uint8_t)library_data_t::library_data_t_size <=
+                        std::numeric_limits<unsigned char>::max() &&
+                    "library_data_t size exceeds limit.");
+      static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
+      static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
+      return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
+    }
+
+    class mem_mgr
+    {
+      mem_mgr()
+      {
+        // Reserved address space, no real memory allocation happens here.
+#if defined(__linux__)
+        mapped_address_space =
+            (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
+                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+#elif defined(_WIN64)
+        mapped_address_space = (byte_t *)VirtualAlloc(
+            NULL,               // NULL specified as the base address parameter
+            mapped_region_size, // Size of allocation
+            MEM_RESERVE,        // Allocate reserved pages
+            PAGE_NOACCESS);     // Protection = no access
+#else
+#error "Only support Windows and Linux."
+#endif
+        next_free = mapped_address_space;
+      };
+
+    public:
+      using buffer_id_t = int;
+
+      struct allocation
+      {
+        buffer_t buffer;
+        byte_t *alloc_ptr;
+        size_t size;
+      };
+
+      ~mem_mgr()
+      {
+#if defined(__linux__)
+        munmap(mapped_address_space, mapped_region_size);
+#elif defined(_WIN64)
+        VirtualFree(mapped_address_space, 0, MEM_RELEASE);
+#else
+#error "Only support Windows and Linux."
+#endif
+      };
+
+      mem_mgr(const mem_mgr &) = delete;
+      mem_mgr &operator=(const mem_mgr &) = delete;
+      mem_mgr(mem_mgr &&) = delete;
+      mem_mgr &operator=(mem_mgr &&) = delete;
+
+      /// Allocate
+      void *mem_alloc(size_t size)
+      {
+        if (!size)
+          return nullptr;
+        std::lock_guard<std::mutex> lock(m_mutex);
+        if (next_free + size > mapped_address_space + mapped_region_size)
+        {
+          throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool");
+        }
+        // Allocation
+        sycl::range<1> r(size);
+        buffer_t buf(r);
+        allocation A{buf, next_free, size};
+        // Map allocation to device pointer
+        void *result = next_free;
+        m_map.emplace(next_free + size, A);
+        // Update pointer to the next free space.
+        next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
+
+        return result;
+      }
+
+      /// Deallocate
+      void mem_free(const void *ptr)
+      {
+        if (!ptr)
+          return;
+        std::lock_guard<std::mutex> lock(m_mutex);
+        auto it = get_map_iterator(ptr);
+        m_map.erase(it);
+      }
+
+      /// map: device pointer -> allocation(buffer, alloc_ptr, size)
+      allocation translate_ptr(const void *ptr)
+      {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        auto it = get_map_iterator(ptr);
+        return it->second;
+      }
+
+      /// Check if the pointer represents device pointer or not.
+      bool is_device_ptr(const void *ptr) const
+      {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        return (mapped_address_space <= ptr) &&
+               (ptr < mapped_address_space + mapped_region_size);
+      }
+
+      /// Returns the instance of memory manager singleton.
+      static mem_mgr &instance()
+      {
+        static mem_mgr m;
+        return m;
+      }
+
+    private:
+      std::map<byte_t *, allocation> m_map;
+      mutable std::mutex m_mutex;
+      byte_t *mapped_address_space;
+      byte_t *next_free;
+      const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
+      const size_t alignment = 256;
+      /// This padding may be defined to some positive value to debug
+      /// out of bound accesses.
+      const size_t extra_padding = 0;
+
+      std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
+      {
+        auto it = m_map.upper_bound((byte_t *)ptr);
+        if (it == m_map.end())
+        {
+          // Not a virtual pointer.
+          throw std::runtime_error("can not get buffer from non-virtual pointer");
+        }
+        const allocation &alloc = it->second;
+        if (ptr < alloc.alloc_ptr)
+        {
+          // Out of bound.
+          // This may happen if there's a gap between allocations due to alignment
+          // or extra padding and pointer points to this gap.
+          throw std::runtime_error("invalid virtual pointer");
+        }
+        return it;
+      }
+    };
+
+    template <class T, memory_region Memory, size_t Dimension>
+    class accessor;
+    template <memory_region Memory, class T = byte_t>
+    class memory_traits
+    {
+    public:
+      static constexpr sycl::access::target target =
+          sycl::access::target::device;
+      static constexpr sycl::access_mode mode =
+          (Memory == constant) ? sycl::access_mode::read
+                               : sycl::access_mode::read_write;
+      static constexpr size_t type_size = sizeof(T);
+      using element_t =
+          typename std::conditional<Memory == constant, const T, T>::type;
+      using value_t = typename std::remove_cv<T>::type;
+      template <size_t Dimension = 1>
+      using accessor_t = typename std::conditional<
+          Memory == local, sycl::local_accessor<value_t, Dimension>,
+          sycl::accessor<T, Dimension, mode, target>>::type;
+      using pointer_t = T *;
+    };
+
+    static inline void *dpct_malloc(size_t size, sycl::queue &q)
+    {
+#ifdef DPCT_USM_LEVEL_NONE
+      return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
+#else
+      return sycl::malloc_device(size, q.get_device(), q.get_context());
+#endif // DPCT_USM_LEVEL_NONE
+    }
+
+#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
+    static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
+                                    sycl::queue &q)
+    {
+      pitch = PITCH_DEFAULT_ALIGN(x);
+      return dpct_malloc(pitch * y * z, q);
+    }
+
+    /**
+     * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
+     * @tparam valueT The type of the element to be set.
+     * @param [in] q The queue in which the operation is done.
+     * @param [in] dev_ptr Pointer to the virtual device memory address.
+     * @param [in] value The value to be set.
+     * @param [in] size Number of elements to be set to the value.
+     * @return An event representing the memset operation.
+     */
+    template <typename valueT>
+    static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
+                                          valueT value, size_t size)
+    {
+#ifdef DPCT_USM_LEVEL_NONE
+      auto &mm = mem_mgr::instance();
+      assert(mm.is_device_ptr(dev_ptr));
+      auto alloc = mm.translate_ptr(dev_ptr);
+      size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
+
+      return q.submit([&](sycl::handler &cgh)
+                      {
+    auto r = sycl::range<1>(size);
+    auto o = sycl::id<1>(offset);
+    auto new_buffer = alloc.buffer.reinterpret<valueT>(
+        sycl::range<1>(alloc.size / sizeof(valueT)));
+    sycl::accessor<valueT, 1, sycl::access_mode::write,
+                   sycl::access::target::device>
+        acc(new_buffer, cgh, r, o);
+    cgh.fill(acc, value); });
+#else
+      return q.fill(dev_ptr, value, size);
+#endif // DPCT_USM_LEVEL_NONE
+    }
+
+    /**
+     * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
+     * @tparam valueT The type of the element to be set.
+     * @param [in] q The queue in which the operation is done.
+     * @param [in] data Pointer to the pitched device memory region.
+     * @param [in] value The value to be set.
+     * @param [in] size 3D memory region by number of elements.
+     * @return An event list representing the memset operations.
+     */
+    template <typename valueT>
+    static inline std::vector<sycl::event>
+    dpct_memset(sycl::queue &q, pitched_data data, valueT value,
+                sycl::range<3> size)
+    {
+      std::vector<sycl::event> event_list;
+      size_t slice = data.get_pitch() * data.get_y();
+      unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
+      for (size_t z = 0; z < size.get(2); ++z)
+      {
+        unsigned char *data_ptr = data_surface;
+        for (size_t y = 0; y < size.get(1); ++y)
+        {
+          event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
+          data_ptr += data.get_pitch();
+        }
+        data_surface += slice;
+      }
+      return event_list;
+    }
+
+    /**
+     * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
+     * @tparam valueT The type of the element to be set.
+     * @param [in] q The queue in which the operation is done.
+     * @param [in] ptr Pointer to the virtual device memory.
+     * @param [in] pitch The pitch size by number of elements, including padding.
+     * @param [in] val The value to be set.
+     * @param [in] x The width of memory region by number of elements.
+     * @param [in] y The height of memory region by number of elements.
+     * @return An event list representing the memset operations.
+     */
+    template <typename valueT>
+    static inline std::vector<sycl::event>
+    dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
+                size_t y)
+    {
+      return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
+                         sycl::range<3>(x, y, 1));
+    }
+
+    static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
+                                                    const void *from_ptr,
+                                                    memcpy_direction dir)
+    {
+      switch (dir)
+      {
+      case memcpy_direction::host_to_host:
+      case memcpy_direction::host_to_device:
+      case memcpy_direction::device_to_host:
+      case memcpy_direction::device_to_device:
+        return dir;
+      case memcpy_direction::automatic:
+      {
+        // table[to_attribute][from_attribute]
+        static const memcpy_direction
+            direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
+                           [static_cast<unsigned>(pointer_access_attribute::end)] =
+                               {{memcpy_direction::host_to_host,
+                                 memcpy_direction::device_to_host,
+                                 memcpy_direction::host_to_host},
+                                {memcpy_direction::host_to_device,
+                                 memcpy_direction::device_to_device,
+                                 memcpy_direction::device_to_device},
+                                {memcpy_direction::host_to_host,
+                                 memcpy_direction::device_to_device,
+                                 memcpy_direction::device_to_device}};
+        return direction_table[static_cast<unsigned>(get_pointer_attribute(
+            q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
+      }
+      default:
+        throw std::runtime_error("dpct_memcpy: invalid direction value");
+      }
+    }
+
+    static sycl::event
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+                memcpy_direction direction,
+                const std::vector<sycl::event> &dep_events = {})
+    {
+      if (!size)
+        return sycl::event{};
+#ifdef DPCT_USM_LEVEL_NONE
+      auto &mm = mem_mgr::instance();
+      auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+
+      switch (real_direction)
+      {
+      case host_to_host:
+        return q.submit([&](sycl::handler &cgh)
+                        {
+      cgh.depends_on(dep_events);
+      cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
+      case host_to_device:
+      {
+        auto alloc = mm.translate_ptr(to_ptr);
+        size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
+        return q.submit([&](sycl::handler &cgh)
+                        {
+      cgh.depends_on(dep_events);
+      auto r = sycl::range<1>(size);
+      auto o = sycl::id<1>(offset);
+      sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                          sycl::access::target::device>
+          acc(alloc.buffer, cgh, r, o);
+      cgh.copy(from_ptr, acc); });
+      }
+      case device_to_host:
+      {
+        auto alloc = mm.translate_ptr(from_ptr);
+        size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
+        return q.submit([&](sycl::handler &cgh)
+                        {
+      cgh.depends_on(dep_events);
+      auto r = sycl::range<1>(size);
+      auto o = sycl::id<1>(offset);
+      sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                          sycl::access::target::device>
+          acc(alloc.buffer, cgh, r, o);
+      cgh.copy(acc, to_ptr); });
+      }
+      case device_to_device:
+      {
+        auto to_alloc = mm.translate_ptr(to_ptr);
+        auto from_alloc = mm.translate_ptr(from_ptr);
+        size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
+        size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
+        return q.submit([&](sycl::handler &cgh)
+                        {
+      cgh.depends_on(dep_events);
+      auto r = sycl::range<1>(size);
+      auto to_o = sycl::id<1>(to_offset);
+      auto from_o = sycl::id<1>(from_offset);
+      sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                          sycl::access::target::device>
+          to_acc(to_alloc.buffer, cgh, r, to_o);
+      sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                          sycl::access::target::device>
+          from_acc(from_alloc.buffer, cgh, r, from_o);
+      cgh.copy(from_acc, to_acc); });
+      }
+      default:
+        throw std::runtime_error("dpct_memcpy: invalid direction value");
+      }
+#else
+      return q.memcpy(to_ptr, from_ptr, size, dep_events);
+#endif // DPCT_USM_LEVEL_NONE
+    }
+
+    // Get actual copy range and make sure it will not exceed range.
+    static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
+                                        size_t pitch)
+    {
+      return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
+    }
+
+    static inline size_t get_offset(sycl::id<3> id, size_t slice,
+                                    size_t pitch)
+    {
+      return slice * id.get(2) + pitch * id.get(1) + id.get(0);
+    }
+
+    /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
+    /// and \p from_range to another specified by \p to_ptr and \p to_range.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                sycl::range<3> to_range, sycl::range<3> from_range,
+                sycl::id<3> to_id, sycl::id<3> from_id,
+                sycl::range<3> size, memcpy_direction direction,
+                const std::vector<sycl::event> &dep_events = {})
+    {
+      // RAII for host pointer
+      class host_buffer
+      {
+        void *_buf;
+        size_t _size;
+        sycl::queue &_q;
+        const std::vector<sycl::event> &_deps; // free operation depends
+
+      public:
+        host_buffer(size_t size, sycl::queue &q,
+                    const std::vector<sycl::event> &deps)
+            : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
+        void *get_ptr() const { return _buf; }
+        size_t get_size() const { return _size; }
+        ~host_buffer()
+        {
+          if (_buf)
+          {
+            _q.submit([&](sycl::handler &cgh)
+                      {
+          cgh.depends_on(_deps);
+          cgh.host_task([buf = _buf] { std::free(buf); }); });
+          }
+        }
+      };
+      std::vector<sycl::event> event_list;
+
+      size_t to_slice = to_range.get(1) * to_range.get(0),
+             from_slice = from_range.get(1) * from_range.get(0);
+      unsigned char *to_surface =
+          (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+      const unsigned char *from_surface =
+          (const unsigned char *)from_ptr +
+          get_offset(from_id, from_slice, from_range.get(0));
+
+      if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
+      {
+        return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                            direction, dep_events)};
+      }
+      direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+      size_t size_slice = size.get(1) * size.get(0);
+      switch (direction)
+      {
+      case host_to_host:
+        for (size_t z = 0; z < size.get(2); ++z)
+        {
+          unsigned char *to_ptr = to_surface;
+          const unsigned char *from_ptr = from_surface;
+          if (to_range.get(0) == from_range.get(0) &&
+              to_range.get(0) == size.get(0))
+          {
+            event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
+                                             direction, dep_events));
+          }
+          else
+          {
+            for (size_t y = 0; y < size.get(1); ++y)
+            {
+              event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
+                                               direction, dep_events));
+              to_ptr += to_range.get(0);
+              from_ptr += from_range.get(0);
+            }
+          }
+          to_surface += to_slice;
+          from_surface += from_slice;
+        }
+        break;
+      case host_to_device:
+      {
+        host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
+                        event_list);
+        std::vector<sycl::event> host_events;
+        if (to_slice == size_slice)
+        {
+          // Copy host data to a temp host buffer with the shape of target.
+          host_events =
+              dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
+                          sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                          host_to_host, dep_events);
+        }
+        else
+        {
+          // Copy host data to a temp host buffer with the shape of target.
+          host_events = dpct_memcpy(
+              q, buf.get_ptr(), from_surface, to_range, from_range,
+              sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
+              // If has padding data, not sure whether it is useless. So fill temp
+              // buffer with it.
+              std::vector<sycl::event>{
+                  dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                              device_to_host, dep_events)});
+        }
+        // Copy from temp host buffer to device with only one submit.
+        event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
+                                         buf.get_size(), host_to_device,
+                                         host_events));
+        break;
+      }
+      case device_to_host:
+      {
+        host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
+                        event_list);
+        // Copy from host temp buffer to host target with reshaping.
+        event_list = dpct_memcpy(
+            q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
+            sycl::id<3>(0, 0, 0), size, host_to_host,
+            // Copy from device to temp host buffer with only one submit.
+            std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
+                                                 buf.get_size(),
+                                                 device_to_host, dep_events)});
+        break;
+      }
+      case device_to_device:
+#ifdef DPCT_USM_LEVEL_NONE
+      {
+        auto &mm = mem_mgr::instance();
+        auto to_alloc = mm.translate_ptr(to_surface);
+        auto from_alloc = mm.translate_ptr(from_surface);
+        size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
+        size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
+        event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                      {
+      cgh.depends_on(dep_events);
+      auto to_o = sycl::id<1>(to_offset);
+      auto from_o = sycl::id<1>(from_offset);
+      sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                         sycl::access::target::device>
+          to_acc(to_alloc.buffer, cgh,
+                 get_copy_range(size, to_slice, to_range.get(0)), to_o);
+      sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                         sycl::access::target::device>
+          from_acc(from_alloc.buffer, cgh,
+                   get_copy_range(size, from_slice, from_range.get(0)), from_o);
+      cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
+          size,
+          [=](sycl::id<3> id) {
+            to_acc[get_offset(id, to_slice, to_range.get(0))] =
+                from_acc[get_offset(id, from_slice, from_range.get(0))];
+          }); }));
+      }
+#else
+        event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                      {
+      cgh.depends_on(dep_events);
+      cgh.parallel_for<class dpct_memcpy_3d_detail>(
+          size,
+          [=](sycl::id<3> id) {
+            to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                from_surface[get_offset(id, from_slice, from_range.get(0))];
+          }); }));
+#endif
+      break;
+      default:
+        throw std::runtime_error("dpct_memcpy: invalid direction value");
+      }
+      return event_list;
+    }
+
+    /// memcpy 2D/3D matrix specified by pitched_data.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+                memcpy_direction direction = automatic)
+    {
+      return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
+                         sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                         sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                         size, direction);
+    }
+
+    /// memcpy 2D matrix with pitch.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+                memcpy_direction direction = automatic)
+    {
+      return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
+                         sycl::range<3>(from_pitch, y, 1),
+                         sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                         sycl::range<3>(x, y, 1), direction);
+    }
+
+    namespace deprecated
+    {
+
+      template <typename T, sycl::usm::alloc AllocKind>
+      class usm_allocator
+      {
+      private:
+        using Alloc = sycl::usm_allocator<T, AllocKind>;
+        Alloc _impl;
+
+      public:
+        using value_type = typename std::allocator_traits<Alloc>::value_type;
+        using pointer = typename std::allocator_traits<Alloc>::pointer;
+        using const_pointer = typename std::allocator_traits<Alloc>::const_pointer;
+        using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
+        using const_void_pointer =
+            typename std::allocator_traits<Alloc>::const_void_pointer;
+        using reference = typename std::allocator_traits<Alloc>::value_type &;
+        using const_reference =
+            const typename std::allocator_traits<Alloc>::value_type &;
+        using difference_type =
+            typename std::allocator_traits<Alloc>::difference_type;
+        using size_type = typename std::allocator_traits<Alloc>::size_type;
+        using propagate_on_container_copy_assignment = typename std::allocator_traits<
+            Alloc>::propagate_on_container_copy_assignment;
+        using propagate_on_container_move_assignment = typename std::allocator_traits<
+            Alloc>::propagate_on_container_move_assignment;
+        using propagate_on_container_swap =
+            typename std::allocator_traits<Alloc>::propagate_on_container_swap;
+        using is_always_equal =
+            typename std::allocator_traits<Alloc>::is_always_equal;
+
+        template <typename U>
+        struct rebind
+        {
+          typedef usm_allocator<U, AllocKind> other;
+        };
+
+        usm_allocator() : _impl(dpct::get_default_queue()) {}
+        ~usm_allocator() {}
+        usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
+        usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
+        pointer address(reference r) { return &r; }
+        const_pointer address(const_reference r) { return &r; }
+        pointer allocate(size_type cnt, const_void_pointer hint = nullptr)
+        {
+          return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint);
+        }
+        void deallocate(pointer p, size_type cnt)
+        {
+          std::allocator_traits<Alloc>::deallocate(_impl, p, cnt);
+        }
+        size_type max_size() const
+        {
+          return std::allocator_traits<Alloc>::max_size(_impl);
+        }
+        bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
+        bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
+      };
+
+    } // namespace deprecated
+
+    inline void dpct_free(void *ptr,
+                          const sycl::queue &q)
+    {
+      if (ptr)
+      {
+#ifdef DPCT_USM_LEVEL_NONE
+        detail::mem_mgr::instance().mem_free(ptr);
+#else
+        sycl::free(ptr, q.get_context());
+#endif // DPCT_USM_LEVEL_NONE
+      }
+    }
+
+    template <typename T>
+    inline auto get_memory(const void *x)
+    {
+      T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
+#ifdef DPCT_USM_LEVEL_NONE
+      return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
+#else
+      return new_x;
+#endif
+    }
+
+    template <typename T>
+    inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q)
+    {
+      using Ty = typename DataType<T>::T2;
+      Ty s_h;
+      if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
+        detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
+            .wait();
+      else
+        s_h = *reinterpret_cast<const Ty *>(s);
+      return s_h;
+    }
+
+  } // namespace detail
+
+  template <typename T>
+  inline auto get_value(const T *s, sycl::queue &q)
+  {
+    return detail::get_value(s, q);
+  }
+
+  namespace detail
+  {
+    template <class Ta, class Tb, class Tc, class Ts>
+    inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                          oneapi::mkl::transpose b_trans, int m, int n, int k,
+                          const void *alpha, const void *a, int lda, const void *b,
+                          int ldb, const void *beta, void *c, int ldc)
+    {
+#ifndef __INTEL_MKL__
+      throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                               "Project does not support this API.");
+#else
+      Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+      Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+      auto data_a = get_memory<const Ta>(a);
+      auto data_b = get_memory<const Tb>(b);
+      auto data_c = get_memory<Tc>(c);
+      oneapi::mkl::blas::column_major::gemm(
+          q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
+          data_b, ldb, beta_value, data_c, ldc);
+#endif
+    }
+
+    template <typename VecT, class BinaryOperation, class = void>
+    class vectorized_binary
+    {
+    public:
+      inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
+      {
+        VecT v4;
+        for (size_t i = 0; i < v4.size(); ++i)
+        {
+          v4[i] = binary_op(a[i], b[i]);
+        }
+        return v4;
+      }
+    };
+
+    template <typename VecT, class BinaryOperation>
+    class vectorized_binary<
+        VecT, BinaryOperation,
+        std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>>
+    {
+    public:
+      inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
+      {
+        return binary_op(a, b).template as<VecT>();
+      }
+    };
+
+    template <class Ta, class Tb, class Tc, class Ts>
+    inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                                oneapi::mkl::transpose b_trans, int m, int n, int k,
+                                const void *alpha, const void **a, int lda,
+                                const void **b, int ldb, const void *beta, void **c,
+                                int ldc, int batch_size)
+    {
+      struct matrix_info_t
+      {
+        oneapi::mkl::transpose transpose_info[2];
+        Ts value_info[2];
+        std::int64_t size_info[3];
+        std::int64_t ld_info[3];
+        std::int64_t groupsize_info;
+      };
+
+      Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+      Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+
+      matrix_info_t *matrix_info =
+          (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
+      matrix_info->transpose_info[0] = a_trans;
+      matrix_info->transpose_info[1] = b_trans;
+      matrix_info->value_info[0] = alpha_value;
+      matrix_info->value_info[1] = beta_value;
+      matrix_info->size_info[0] = m;
+      matrix_info->size_info[1] = n;
+      matrix_info->size_info[2] = k;
+      matrix_info->ld_info[0] = lda;
+      matrix_info->ld_info[1] = ldb;
+      matrix_info->ld_info[2] = ldc;
+      matrix_info->groupsize_info = batch_size;
+
+      sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
+          q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
+          matrix_info->size_info, matrix_info->size_info + 1,
+          matrix_info->size_info + 2, matrix_info->value_info,
+          reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
+          reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
+          matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
+          matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
+
+      q.submit([&](sycl::handler &cgh)
+               {
+    cgh.depends_on(e);
+    cgh.host_task([=] { std::free(matrix_info); }); });
+    }
+
+    template <class Ta, class Tb, class Tc, class Ts>
+    inline void
+    gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                    oneapi::mkl::transpose b_trans, int m, int n,
+                    int k, const void *alpha, const void *a, int lda,
+                    long long int stride_a, const void *b, int ldb,
+                    long long int stride_b, const void *beta, void *c,
+                    int ldc, long long int stride_c, int batch_size)
+    {
+      Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+      Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+      auto data_a = get_memory<const Ta>(a);
+      auto data_b = get_memory<const Tb>(b);
+      auto data_c = get_memory<Tc>(c);
+      oneapi::mkl::blas::column_major::gemm_batch(
+          q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
+          stride_a, data_b, ldb, stride_b, beta_value,
+          data_c, ldc, stride_c, batch_size);
+    }
+
+  } // namespace detail
+
+  template <typename VecT, class BinaryOperation>
+  inline unsigned vectorized_binary(unsigned a, unsigned b,
+                                    const BinaryOperation binary_op)
+  {
+    sycl::vec<unsigned, 1> v0{a}, v1{b};
+    auto v2 = v0.as<VecT>();
+    auto v3 = v1.as<VecT>();
+    auto v4 =
+        detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op);
+    v0 = v4.template as<sycl::vec<unsigned, 1>>();
+    return v0;
+  }
+
+  static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
+                                memcpy_direction direction = automatic,
+                                sycl::queue &q = dpct::get_default_queue())
+  {
+    detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
+  }
+
+  static inline unsigned int select_device(unsigned int id)
+  {
+    dev_mgr::instance().select_device(id);
+    return id;
+  }
+
+  template <typename T>
+  T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
+                             int logical_sub_group_size = 32)
+  {
+    unsigned int id = g.get_local_linear_id();
+    unsigned int start_index =
+        id / logical_sub_group_size * logical_sub_group_size;
+    unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
+    return sycl::select_from_group(g, x,
+                                   target_offset < logical_sub_group_size
+                                       ? start_index + target_offset
+                                       : id);
+  }
+
+  template <typename T>
+  sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
+  {
+    return sycl::vec<T, 1>(val)
+        .template as<sycl::vec<
+            std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
+        .template convert<T>();
+  }
+
+  template <typename T1, typename T2>
+  using dot_product_acc_t =
+      std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
+                         uint32_t, int32_t>;
+
+  template <typename T1, typename T2, typename T3>
+  inline auto dp4a(T1 a, T2 b, T3 c)
+  {
+    dot_product_acc_t<T1, T2> res = c;
+    auto va = extract_and_sign_or_zero_extend4(a);
+    auto vb = extract_and_sign_or_zero_extend4(b);
+    res += va[0] * vb[0];
+    res += va[1] * vb[1];
+    res += va[2] * vb[2];
+    res += va[3] * vb[3];
+    return res;
+  }
+
+  struct sub_sat
+  {
+    template <typename T>
+    auto operator()(const T x, const T y) const
+    {
+      return sycl::sub_sat(x, y);
+    }
+  };
+
+  template <typename S, typename T>
+  inline T vectorized_min(T a, T b)
+  {
+    sycl::vec<T, 1> v0{a}, v1{b};
+    auto v2 = v0.template as<S>();
+    auto v3 = v1.template as<S>();
+    auto v4 = sycl::min(v2, v3);
+    v0 = v4.template as<sycl::vec<T, 1>>();
+    return v0;
+  }
+
+  inline float pow(const float a, const int b) { return sycl::pown(a, b); }
+  inline double pow(const double a, const int b) { return sycl::pown(a, b); }
+  inline float pow(const float a, const float b) { return sycl::pow(a, b); }
+  inline double pow(const double a, const double b) { return sycl::pow(a, b); }
+  template <typename T, typename U>
+  inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
+  pow(const T a, const U b)
+  {
+    return sycl::pow(a, static_cast<T>(b));
+  }
+  template <typename T, typename U>
+  inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
+  pow(const T a, const U b)
+  {
+    return sycl::pow(static_cast<double>(a), static_cast<double>(b));
+  }
+
+  inline double min(const double a, const float b)
+  {
+    return sycl::fmin(a, static_cast<double>(b));
+  }
+  inline double min(const float a, const double b)
+  {
+    return sycl::fmin(static_cast<double>(a), b);
+  }
+  inline float min(const float a, const float b) { return sycl::fmin(a, b); }
+  inline double min(const double a, const double b) { return sycl::fmin(a, b); }
+  inline std::uint32_t min(const std::uint32_t a, const std::int32_t b)
+  {
+    return sycl::min(a, static_cast<std::uint32_t>(b));
+  }
+  inline std::uint32_t min(const std::int32_t a, const std::uint32_t b)
+  {
+    return sycl::min(static_cast<std::uint32_t>(a), b);
+  }
+  inline std::int32_t min(const std::int32_t a, const std::int32_t b)
+  {
+    return sycl::min(a, b);
+  }
+  inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b)
+  {
+    return sycl::min(a, b);
+  }
+  inline std::uint64_t min(const std::uint64_t a, const std::int64_t b)
+  {
+    return sycl::min(a, static_cast<std::uint64_t>(b));
+  }
+  inline std::uint64_t min(const std::int64_t a, const std::uint64_t b)
+  {
+    return sycl::min(static_cast<std::uint64_t>(a), b);
+  }
+  inline std::int64_t min(const std::int64_t a, const std::int64_t b)
+  {
+    return sycl::min(a, b);
+  }
+  inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b)
+  {
+    return sycl::min(a, b);
+  }
+  inline std::uint64_t min(const std::uint64_t a, const std::int32_t b)
+  {
+    return sycl::min(a, static_cast<std::uint64_t>(b));
+  }
+  inline std::uint64_t min(const std::int32_t a, const std::uint64_t b)
+  {
+    return sycl::min(static_cast<std::uint64_t>(a), b);
+  }
+  inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b)
+  {
+    return sycl::min(a, static_cast<std::uint64_t>(b));
+  }
+  inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b)
+  {
+    return sycl::min(static_cast<std::uint64_t>(a), b);
+  }
+  // max function overloads.
+  // For floating-point types, `float` or `double` arguments are acceptable.
+  // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
+  // `std::int64_t` type arguments are acceptable.
+  inline double max(const double a, const float b)
+  {
+    return sycl::fmax(a, static_cast<double>(b));
+  }
+  inline double max(const float a, const double b)
+  {
+    return sycl::fmax(static_cast<double>(a), b);
+  }
+  inline float max(const float a, const float b) { return sycl::fmax(a, b); }
+  inline double max(const double a, const double b) { return sycl::fmax(a, b); }
+  inline std::uint32_t max(const std::uint32_t a, const std::int32_t b)
+  {
+    return sycl::max(a, static_cast<std::uint32_t>(b));
+  }
+  inline std::uint32_t max(const std::int32_t a, const std::uint32_t b)
+  {
+    return sycl::max(static_cast<std::uint32_t>(a), b);
+  }
+  inline std::int32_t max(const std::int32_t a, const std::int32_t b)
+  {
+    return sycl::max(a, b);
+  }
+  inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b)
+  {
+    return sycl::max(a, b);
+  }
+  inline std::uint64_t max(const std::uint64_t a, const std::int64_t b)
+  {
+    return sycl::max(a, static_cast<std::uint64_t>(b));
+  }
+  inline std::uint64_t max(const std::int64_t a, const std::uint64_t b)
+  {
+    return sycl::max(static_cast<std::uint64_t>(a), b);
+  }
+  inline std::int64_t max(const std::int64_t a, const std::int64_t b)
+  {
+    return sycl::max(a, b);
+  }
+  inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b)
+  {
+    return sycl::max(a, b);
+  }
+  inline std::uint64_t max(const std::uint64_t a, const std::int32_t b)
+  {
+    return sycl::max(a, static_cast<std::uint64_t>(b));
+  }
+  inline std::uint64_t max(const std::int32_t a, const std::uint64_t b)
+  {
+    return sycl::max(static_cast<std::uint64_t>(a), b);
+  }
+  inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b)
+  {
+    return sycl::max(a, static_cast<std::uint64_t>(b));
+  }
+  inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b)
+  {
+    return sycl::max(static_cast<std::uint64_t>(a), b);
+  }
+
+  inline void
+  has_capability_or_fail(const sycl::device &dev,
+                         const std::initializer_list<sycl::aspect> &props)
+  {
+    for (const auto &it : props)
+    {
+      if (dev.has(it))
+        continue;
+      switch (it)
+      {
+      case sycl::aspect::fp64:
+        throw std::runtime_error("'double' is not supported in '" +
+                                 dev.get_info<sycl::info::device::name>() +
+                                 "' device");
+        break;
+      case sycl::aspect::fp16:
+        throw std::runtime_error("'half' is not supported in '" +
+                                 dev.get_info<sycl::info::device::name>() +
+                                 "' device");
+        break;
+      default:
+#define __SYCL_ASPECT(ASPECT, ID) \
+  case sycl::aspect::ASPECT:      \
+    return #ASPECT;
+#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
+#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
+        auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string
+        {
+          switch (AspectNum)
+          {
+#include <sycl/info/aspects.def>
+#include <sycl/info/aspects_deprecated.def>
+          default:
+            return "unknown aspect";
+          }
+        };
+#undef __SYCL_ASPECT_DEPRECATED_ALIAS
+#undef __SYCL_ASPECT_DEPRECATED
+#undef __SYCL_ASPECT
+        throw std::runtime_error(
+            "'" + getAspectNameStr(it) + "' is not supported in '" +
+            dev.get_info<sycl::info::device::name>() + "' device");
+      }
+      break;
+    }
+  }
+
+  static inline unsigned int get_current_device_id()
+  {
+    return dev_mgr::instance().current_device_id();
+  }
+
+  static inline device_ext &get_current_device()
+  {
+    return dev_mgr::instance().current_device();
+  }
+
+  static inline sycl::queue &get_in_order_queue()
+  {
+    return dev_mgr::instance().current_device().in_order_queue();
+  }
+
+  static sycl::event
+  dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+              memcpy_direction direction,
+              const std::vector<sycl::event> &dep_events = {})
+  {
+    if (!size)
+      return sycl::event{};
+#ifdef DPCT_USM_LEVEL_NONE
+    auto &mm = mem_mgr::instance();
+    auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+
+    switch (real_direction)
+    {
+    case host_to_host:
+      return q.submit([&](sycl::handler &cgh)
+                      {
+          cgh.depends_on(dep_events);
+          cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
+    case host_to_device:
+    {
+      auto alloc = mm.translate_ptr(to_ptr);
+      size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
+      return q.submit([&](sycl::handler &cgh)
+                      {
+          cgh.depends_on(dep_events);
+          auto r = sycl::range<1>(size);
+          auto o = sycl::id<1>(offset);
+          sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                              sycl::access::target::device>
+              acc(alloc.buffer, cgh, r, o);
+          cgh.copy(from_ptr, acc); });
+    }
+    case device_to_host:
+    {
+      auto alloc = mm.translate_ptr(from_ptr);
+      size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
+      return q.submit([&](sycl::handler &cgh)
+                      {
+          cgh.depends_on(dep_events);
+          auto r = sycl::range<1>(size);
+          auto o = sycl::id<1>(offset);
+          sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                              sycl::access::target::device>
+              acc(alloc.buffer, cgh, r, o);
+          cgh.copy(acc, to_ptr); });
+    }
+    case device_to_device:
+    {
+      auto to_alloc = mm.translate_ptr(to_ptr);
+      auto from_alloc = mm.translate_ptr(from_ptr);
+      size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
+      size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
+      return q.submit([&](sycl::handler &cgh)
+                      {
+          cgh.depends_on(dep_events);
+          auto r = sycl::range<1>(size);
+          auto to_o = sycl::id<1>(to_offset);
+          auto from_o = sycl::id<1>(from_offset);
+          sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                              sycl::access::target::device>
+              to_acc(to_alloc.buffer, cgh, r, to_o);
+          sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                              sycl::access::target::device>
+              from_acc(from_alloc.buffer, cgh, r, from_o);
+          cgh.copy(from_acc, to_acc); });
+    }
+    default:
+      throw std::runtime_error("dpct_memcpy: invalid direction value");
+    }
+#else
+    return q.memcpy(to_ptr, from_ptr, size, dep_events);
+#endif // DPCT_USM_LEVEL_NONE
+  }
+
+  // Get actual copy range and make sure it will not exceed range.
+  static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
+                                      size_t pitch)
+  {
+    return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
+  }
+
+  static inline size_t get_offset(sycl::id<3> id, size_t slice,
+                                  size_t pitch)
+  {
+    return slice * id.get(2) + pitch * id.get(1) + id.get(0);
+  }
+
+  /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
+  /// and \p from_range to another specified by \p to_ptr and \p to_range.
+  static inline std::vector<sycl::event>
+  dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+              sycl::range<3> to_range, sycl::range<3> from_range,
+              sycl::id<3> to_id, sycl::id<3> from_id,
+              sycl::range<3> size, memcpy_direction direction,
+              const std::vector<sycl::event> &dep_events = {})
+  {
+    // RAII for host pointer
+    class host_buffer
+    {
+      void *_buf;
+      size_t _size;
+      sycl::queue &_q;
+      const std::vector<sycl::event> &_deps; // free operation depends
+
+    public:
+      host_buffer(size_t size, sycl::queue &q,
+                  const std::vector<sycl::event> &deps)
+          : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
+      void *get_ptr() const { return _buf; }
+      size_t get_size() const { return _size; }
+      ~host_buffer()
+      {
+        if (_buf)
+        {
+          _q.submit([&](sycl::handler &cgh)
+                    {
+              cgh.depends_on(_deps);
+              cgh.host_task([buf = _buf] { std::free(buf); }); });
+        }
+      }
+    };
+    std::vector<sycl::event> event_list;
+
+    size_t to_slice = to_range.get(1) * to_range.get(0),
+           from_slice = from_range.get(1) * from_range.get(0);
+    unsigned char *to_surface =
+        (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+    const unsigned char *from_surface =
+        (const unsigned char *)from_ptr +
+        get_offset(from_id, from_slice, from_range.get(0));
+
+    if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
+    {
+      return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                          direction, dep_events)};
+    }
+    direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+    size_t size_slice = size.get(1) * size.get(0);
+    switch (direction)
+    {
+    case host_to_host:
+      for (size_t z = 0; z < size.get(2); ++z)
+      {
+        unsigned char *to_ptr = to_surface;
+        const unsigned char *from_ptr = from_surface;
+        if (to_range.get(0) == from_range.get(0) &&
+            to_range.get(0) == size.get(0))
+        {
+          event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
+                                           direction, dep_events));
+        }
+        else
+        {
+          for (size_t y = 0; y < size.get(1); ++y)
+          {
+            event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
+                                             direction, dep_events));
+            to_ptr += to_range.get(0);
+            from_ptr += from_range.get(0);
+          }
+        }
+        to_surface += to_slice;
+        from_surface += from_slice;
+      }
+      break;
+    case host_to_device:
+    {
+      host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
+                      event_list);
+      std::vector<sycl::event> host_events;
+      if (to_slice == size_slice)
+      {
+        // Copy host data to a temp host buffer with the shape of target.
+        host_events =
+            dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
+                        sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                        host_to_host, dep_events);
+      }
+      else
+      {
+        // Copy host data to a temp host buffer with the shape of target.
+        host_events = dpct_memcpy(
+            q, buf.get_ptr(), from_surface, to_range, from_range,
+            sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
+            // If has padding data, not sure whether it is useless. So fill temp
+            // buffer with it.
+            std::vector<sycl::event>{
+                dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                            device_to_host, dep_events)});
+      }
+      // Copy from temp host buffer to device with only one submit.
+      event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
+                                       buf.get_size(), host_to_device,
+                                       host_events));
+      break;
+    }
+    case device_to_host:
+    {
+      host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
+                      event_list);
+      // Copy from host temp buffer to host target with reshaping.
+      event_list = dpct_memcpy(
+          q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
+          sycl::id<3>(0, 0, 0), size, host_to_host,
+          // Copy from device to temp host buffer with only one submit.
+          std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
+                                               buf.get_size(),
+                                               device_to_host, dep_events)});
+      break;
+    }
+    case device_to_device:
+#ifdef DPCT_USM_LEVEL_NONE
+    {
+      auto &mm = mem_mgr::instance();
+      auto to_alloc = mm.translate_ptr(to_surface);
+      auto from_alloc = mm.translate_ptr(from_surface);
+      size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
+      size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
+      event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                    {
+          cgh.depends_on(dep_events);
+          auto to_o = sycl::id<1>(to_offset);
+          auto from_o = sycl::id<1>(from_offset);
+          sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                             sycl::access::target::device>
+              to_acc(to_alloc.buffer, cgh,
+                     get_copy_range(size, to_slice, to_range.get(0)), to_o);
+          sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                             sycl::access::target::device>
+              from_acc(from_alloc.buffer, cgh,
+                       get_copy_range(size, from_slice, from_range.get(0)), from_o);
+          cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
+              size,
+              [=](sycl::id<3> id) {
+                to_acc[get_offset(id, to_slice, to_range.get(0))] =
+                    from_acc[get_offset(id, from_slice, from_range.get(0))];
+              }); }));
+    }
+#else
+      event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                    {
+          cgh.depends_on(dep_events);
+          cgh.parallel_for<class dpct_memcpy_3d_detail>(
+              size,
+              [=](sycl::id<3> id) {
+                to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                    from_surface[get_offset(id, from_slice, from_range.get(0))];
+              }); }));
+#endif
+    break;
+    default:
+      throw std::runtime_error("dpct_memcpy: invalid direction value");
+    }
+    return event_list;
+  }
+
+  /// memcpy 2D/3D matrix specified by pitched_data.
+  static inline std::vector<sycl::event>
+  dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+              pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+              memcpy_direction direction = automatic)
+  {
+    return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
+                       sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                       sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                       size, direction);
+  }
+
+  /// memcpy 2D matrix with pitch.
+  static inline std::vector<sycl::event>
+  dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+              size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+              memcpy_direction direction = automatic)
+  {
+    return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
+                       sycl::range<3>(from_pitch, y, 1),
+                       sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                       sycl::range<3>(x, y, 1), direction);
+  }
+
+  inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                   oneapi::mkl::transpose b_trans, int m, int n, int k,
+                   const void *alpha, const void *a, library_data_t a_type,
+                   int lda, const void *b, library_data_t b_type, int ldb,
+                   const void *beta, void *c, library_data_t c_type, int ldc,
+                   library_data_t scaling_type)
+  {
+    bool matched = false;
+    if (scaling_type == library_data_t::real_float &&
+        c_type == library_data_t::complex_float)
+    {
+      scaling_type = library_data_t::complex_float;
+    }
+    else if (scaling_type == library_data_t::real_double &&
+             c_type == library_data_t::complex_double)
+    {
+      scaling_type = library_data_t::complex_double;
+    }
+
+    std::uint64_t key =
+        detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+    switch (key)
+    {
+    case detail::get_type_combination_id(
+        library_data_t::real_float, library_data_t::real_float,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_impl<float, float, float, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_double, library_data_t::real_double,
+        library_data_t::real_double, library_data_t::real_double):
+    {
+      detail::gemm_impl<double, double, double, double>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::complex_float, library_data_t::complex_float,
+        library_data_t::complex_float, library_data_t::complex_float):
+    {
+      detail::gemm_impl<std::complex<float>, std::complex<float>,
+                        std::complex<float>, std::complex<float>>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::complex_double, library_data_t::complex_double,
+        library_data_t::complex_double, library_data_t::complex_double):
+    {
+      detail::gemm_impl<std::complex<double>, std::complex<double>,
+                        std::complex<double>, std::complex<double>>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_half):
+    {
+      detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                        sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
+                                    lda, b, ldb, beta, c, ldc);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                        float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
+                               ldb, beta, c, ldc);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_impl<sycl::half, sycl::half, float, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_float):
+    {
+      float alpha_value =
+          dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+      float beta_value =
+          dpct::get_value(reinterpret_cast<const float *>(beta), q);
+      sycl::half alpha_half(alpha_value);
+      sycl::half beta_half(beta_value);
+      detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                        sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
+                                    a, lda, b, ldb, &beta_half, c, ldc);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_bfloat16, library_data_t::real_float):
+    {
+      detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                        oneapi::mkl::bfloat16, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_int32, library_data_t::real_int32):
+    {
+      float alpha_float =
+          dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+      float beta_float =
+          dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+      detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
+          q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
+      break;
+    }
+    default:
+      throw std::runtime_error("the combination of data type is unsupported");
+    }
+  } // gemm()
+
+  /// Computes a batch of matrix-matrix product with general matrices.
+  /// \param [in] q The queue where the routine should be executed.
+  /// \param [in] a_trans Specifies the operation applied to A.
+  /// \param [in] b_trans Specifies the operation applied to B.
+  /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+  /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+  /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+  /// \param [in] alpha Scaling factor for the matrix-matrix product.
+  /// \param [in] a Input matrix A.
+  /// \param [in] a_type Data type of the matrix A.
+  /// \param [in] lda Leading dimension of A.
+  /// \param [in] b Input matrix B.
+  /// \param [in] b_type Data type of the matrix B.
+  /// \param [in] ldb Leading dimension of B.
+  /// \param [in] beta Scaling factor for matrix C.
+  /// \param [in, out] c Input/Output matrix C.
+  /// \param [in] c_type Data type of the matrix C.
+  /// \param [in] ldc Leading dimension of C.
+  /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+  /// \param [in] scaling_type Data type of the scaling factors.
+  inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                         oneapi::mkl::transpose b_trans, int m, int n, int k,
+                         const void *alpha, const void *a[],
+                         library_data_t a_type, int lda, const void *b[],
+                         library_data_t b_type, int ldb, const void *beta,
+                         void *c[], library_data_t c_type, int ldc,
+                         int batch_size, library_data_t scaling_type)
+  {
+#ifdef DPCT_USM_LEVEL_NONE
+    throw std::runtime_error("this API is unsupported when USM level is none");
+#else
+    bool matched = false;
+    if (scaling_type == library_data_t::real_float &&
+        c_type == library_data_t::complex_float)
+    {
+      scaling_type = library_data_t::complex_float;
+    }
+    else if (scaling_type == library_data_t::real_double &&
+             c_type == library_data_t::complex_double)
+    {
+      scaling_type = library_data_t::complex_double;
+    }
+
+    std::uint64_t key =
+        detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+    switch (key)
+    {
+    case detail::get_type_combination_id(
+        library_data_t::real_float, library_data_t::real_float,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_batch_impl<float, float, float, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+          batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_double, library_data_t::real_double,
+        library_data_t::real_double, library_data_t::real_double):
+    {
+      detail::gemm_batch_impl<double, double, double, double>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+          batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::complex_float, library_data_t::complex_float,
+        library_data_t::complex_float, library_data_t::complex_float):
+    {
+      detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                              std::complex<float>, std::complex<float>>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+          batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::complex_double, library_data_t::complex_double,
+        library_data_t::complex_double, library_data_t::complex_double):
+    {
+      detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                              std::complex<double>, std::complex<double>>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+          batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_half):
+    {
+      detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                              sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                                          a, lda, b, ldb, beta, c, ldc,
+                                          batch_size);
+      break;
+    }
+#ifdef __INTEL_MKL__
+    case detail::get_type_combination_id(
+        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_bfloat16, library_data_t::real_float):
+    {
+      detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                              oneapi::mkl::bfloat16, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+          batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                                     b, ldb, beta, c, ldc, batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_int32, library_data_t::real_int32):
+    {
+      float alpha_float =
+          dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+      float beta_float =
+          dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+      detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                              float>(q, a_trans, b_trans, m, n, k, &alpha_float,
+                                     a, lda, b, ldb, &beta_float, c, ldc,
+                                     batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+          batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+          batch_size);
+      break;
+    }
+#endif
+    case detail::get_type_combination_id(
+        library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_float):
+    {
+      float alpha_value =
+          dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+      float beta_value =
+          dpct::get_value(reinterpret_cast<const float *>(beta), q);
+      sycl::half alpha_half(alpha_value);
+      sycl::half beta_half(beta_value);
+      detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+          q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
+          batch_size);
+      break;
+    }
+    default:
+      throw std::runtime_error("the combination of data type is unsupported");
+    }
+#endif
+  }
+
+  /// Computes a batch of matrix-matrix product with general matrices.
+  /// \param [in] q The queue where the routine should be executed.
+  /// \param [in] a_trans Specifies the operation applied to A.
+  /// \param [in] b_trans Specifies the operation applied to B.
+  /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+  /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+  /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+  /// \param [in] alpha Scaling factor for the matrix-matrix product.
+  /// \param [in] a Input matrix A.
+  /// \param [in] a_type Data type of the matrix A.
+  /// \param [in] lda Leading dimension of A.
+  /// \param [in] stride_a Stride between the different A matrices.
+  /// \param [in] b Input matrix B.
+  /// \param [in] b_type Data type of the matrix B.
+  /// \param [in] ldb Leading dimension of B.
+  /// \param [in] stride_b Stride between the different B matrices.
+  /// \param [in] beta Scaling factor for matrix C.
+  /// \param [in, out] c Input/Output matrix C.
+  /// \param [in] c_type Data type of the matrix C.
+  /// \param [in] ldc Leading dimension of C.
+  /// \param [in] stride_c Stride between the different C matrices.
+  /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+  /// \param [in] scaling_type Data type of the scaling factors.
+  inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                         oneapi::mkl::transpose b_trans, int m, int n, int k,
+                         const void *alpha, const void *a, library_data_t a_type,
+                         int lda, long long int stride_a, const void *b,
+                         library_data_t b_type, int ldb, long long int stride_b,
+                         const void *beta, void *c, library_data_t c_type,
+                         int ldc, long long int stride_c, int batch_size,
+                         library_data_t scaling_type)
+  {
+    bool matched = false;
+    if (scaling_type == library_data_t::real_float &&
+        c_type == library_data_t::complex_float)
+    {
+      scaling_type = library_data_t::complex_float;
+    }
+    else if (scaling_type == library_data_t::real_double &&
+             c_type == library_data_t::complex_double)
+    {
+      scaling_type = library_data_t::complex_double;
+    }
+
+    std::uint64_t key =
+        detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+    switch (key)
+    {
+    case detail::get_type_combination_id(
+        library_data_t::real_float, library_data_t::real_float,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_batch_impl<float, float, float, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+          beta, c, ldc, stride_c, batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_double, library_data_t::real_double,
+        library_data_t::real_double, library_data_t::real_double):
+    {
+      detail::gemm_batch_impl<double, double, double, double>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+          beta, c, ldc, stride_c, batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::complex_float, library_data_t::complex_float,
+        library_data_t::complex_float, library_data_t::complex_float):
+    {
+      detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                              std::complex<float>, std::complex<float>>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+          beta, c, ldc, stride_c, batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::complex_double, library_data_t::complex_double,
+        library_data_t::complex_double, library_data_t::complex_double):
+    {
+      detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                              std::complex<double>, std::complex<double>>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+          beta, c, ldc, stride_c, batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_half):
+    {
+      detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                              sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                                          a, lda, stride_a, b, ldb, stride_b,
+                                          beta, c, ldc, stride_c, batch_size);
+      break;
+    }
+#ifdef __INTEL_MKL__
+    case detail::get_type_combination_id(
+        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_bfloat16, library_data_t::real_float):
+    {
+      detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                              oneapi::mkl::bfloat16, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+          beta, c, ldc, stride_c, batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                                     stride_a, b, ldb, stride_b, beta, c, ldc,
+                                     stride_c, batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_int32, library_data_t::real_int32):
+    {
+      detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                              std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
+                                            a, lda, stride_a, b, ldb, stride_b,
+                                            beta, c, ldc, stride_c, batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_int8, library_data_t::real_int8,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+          beta, c, ldc, stride_c, batch_size);
+      break;
+    }
+    case detail::get_type_combination_id(
+        library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_float, library_data_t::real_float):
+    {
+      detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+          beta, c, ldc, stride_c, batch_size);
+      break;
+    }
+#endif
+    case detail::get_type_combination_id(
+        library_data_t::real_half, library_data_t::real_half,
+        library_data_t::real_half, library_data_t::real_float):
+    {
+      float alpha_value =
+          dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+      float beta_value =
+          dpct::get_value(reinterpret_cast<const float *>(beta), q);
+      sycl::half alpha_half(alpha_value);
+      sycl::half beta_half(beta_value);
+      detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+          q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
+          &beta_half, c, ldc, stride_c, batch_size);
+      break;
+    }
+    default:
+      throw std::runtime_error("the combination of data type is unsupported");
+    }
+  }
+
+  static inline void
+  async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
+                    size_t from_pitch, size_t x, size_t y,
+                    memcpy_direction direction = automatic,
+                    sycl::queue &q = get_default_queue())
+  {
+    detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
+                        direction);
+  }
+
+  using err0 = detail::generic_error_type<struct err0_tag, int>;
+  using err1 = detail::generic_error_type<struct err1_tag, int>;
+
+} // COPY from DPCT head files
\ No newline at end of file
diff --git a/dpct/atomic.hpp b/dpct/atomic.hpp
deleted file mode 100644
index 4b516f5304023..0000000000000
--- a/dpct/atomic.hpp
+++ /dev/null
@@ -1,842 +0,0 @@
-//==---- atomic.hpp -------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_ATOMIC_HPP__
-#define __DPCT_ATOMIC_HPP__
-
-#include <sycl/sycl.hpp>
-
-namespace dpct {
-
-/// Atomically add the value operand to the value at the addr and assign the
-/// result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to add to the value at \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T,
-          sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-inline T atomic_fetch_add(T *addr, T operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_add(operand);
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T1, typename T2>
-inline T1 atomic_fetch_add(T1 *addr, T2 operand) {
-  auto atm =
-      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_add(operand);
-}
-
-/// Atomically add the value operand to the value at the addr and assign the
-/// result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to add to the value at \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T, sycl::access::address_space addressSpace =
-                          sycl::access::address_space::global_space>
-inline T atomic_fetch_add(T *addr, T operand,
-                          sycl::memory_order memoryOrder) {
-  switch (memoryOrder) {
-  case sycl::memory_order::relaxed:
-    return atomic_fetch_add<T, addressSpace, sycl::memory_order::relaxed,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::acq_rel:
-    return atomic_fetch_add<T, addressSpace, sycl::memory_order::acq_rel,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::seq_cst:
-    return atomic_fetch_add<T, addressSpace, sycl::memory_order::seq_cst,
-                            sycl::memory_scope::device>(addr, operand);
-  default:
-    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
-                    "atomics are: sycl::memory_order::relaxed, "
-                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
-  }
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          typename T1, typename T2>
-inline T1 atomic_fetch_add(T1 *addr, T2 operand,
-                           sycl::memory_order memoryOrder) {
-  atomic_fetch_add<T1, addressSpace>(addr, operand, memoryOrder);
-}
-
-/// Atomically subtract the value operand from the value at the addr and assign
-/// the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to subtract from the value at \p addr
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T,
-          sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-inline T atomic_fetch_sub(T *addr, T operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_sub(operand);
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T1, typename T2>
-inline T1 atomic_fetch_sub(T1 *addr, T2 operand) {
-  auto atm =
-      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_sub(operand);
-}
-
-/// Atomically subtract the value operand from the value at the addr and assign
-/// the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to subtract from the value at \p addr
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T, sycl::access::address_space addressSpace =
-                          sycl::access::address_space::global_space>
-inline T atomic_fetch_sub(T *addr, T operand,
-                          sycl::memory_order memoryOrder) {
-  switch (memoryOrder) {
-  case sycl::memory_order::relaxed:
-    return atomic_fetch_sub<T, addressSpace, sycl::memory_order::relaxed,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::acq_rel:
-    return atomic_fetch_sub<T, addressSpace, sycl::memory_order::acq_rel,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::seq_cst:
-    return atomic_fetch_sub<T, addressSpace, sycl::memory_order::seq_cst,
-                            sycl::memory_scope::device>(addr, operand);
-  default:
-    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
-                    "atomics are: sycl::memory_order::relaxed, "
-                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
-  }
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          typename T1, typename T2>
-inline T1 atomic_fetch_sub(T1 *addr, T2 operand,
-                           sycl::memory_order memoryOrder) {
-  atomic_fetch_sub<T1, addressSpace>(addr, operand, memoryOrder);
-}
-
-/// Atomically perform a bitwise AND between the value operand and the value at the addr
-/// and assign the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to use in bitwise AND operation with the value at the \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T,
-          sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-inline T atomic_fetch_and(T *addr, T operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_and(operand);
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T1, typename T2>
-inline T1 atomic_fetch_and(T1 *addr, T2 operand) {
-  auto atm =
-      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_and(operand);
-}
-
-/// Atomically perform a bitwise AND between the value operand and the value at the addr
-/// and assign the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to use in bitwise AND operation with the value at the \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T, sycl::access::address_space addressSpace =
-                          sycl::access::address_space::global_space>
-inline T atomic_fetch_and(T *addr, T operand,
-                          sycl::memory_order memoryOrder) {
-  switch (memoryOrder) {
-  case sycl::memory_order::relaxed:
-    return atomic_fetch_and<T, addressSpace, sycl::memory_order::relaxed,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::acq_rel:
-    return atomic_fetch_and<T, addressSpace, sycl::memory_order::acq_rel,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::seq_cst:
-    return atomic_fetch_and<T, addressSpace, sycl::memory_order::seq_cst,
-                            sycl::memory_scope::device>(addr, operand);
-  default:
-    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
-                    "atomics are: sycl::memory_order::relaxed, "
-                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
-  }
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          typename T1, typename T2>
-inline T1 atomic_fetch_and(T1 *addr, T2 operand,
-                           sycl::memory_order memoryOrder) {
-  atomic_fetch_and<T1, addressSpace>(addr, operand, memoryOrder);
-}
-
-/// Atomically or the value at the addr with the value operand, and assign
-/// the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to use in bitwise OR operation with the value at the \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T,
-          sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-inline T atomic_fetch_or(T *addr, T operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_or(operand);
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T1, typename T2>
-inline T1 atomic_fetch_or(T1 *addr, T2 operand) {
-  auto atm =
-      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_or(operand);
-}
-
-/// Atomically or the value at the addr with the value operand, and assign
-/// the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to use in bitwise OR operation with the value at the \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T, sycl::access::address_space addressSpace =
-                          sycl::access::address_space::global_space>
-inline T atomic_fetch_or(T *addr, T operand,
-                         sycl::memory_order memoryOrder) {
-  switch (memoryOrder) {
-  case sycl::memory_order::relaxed:
-    return atomic_fetch_or<T, addressSpace, sycl::memory_order::relaxed,
-                           sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::acq_rel:
-    return atomic_fetch_or<T, addressSpace, sycl::memory_order::acq_rel,
-                           sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::seq_cst:
-    return atomic_fetch_or<T, addressSpace, sycl::memory_order::seq_cst,
-                           sycl::memory_scope::device>(addr, operand);
-  default:
-    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
-                    "atomics are: sycl::memory_order::relaxed, "
-                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
-  }
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          typename T1, typename T2>
-inline T1 atomic_fetch_or(T1 *addr, T2 operand,
-                           sycl::memory_order memoryOrder) {
-  atomic_fetch_or<T1, addressSpace>(addr, operand, memoryOrder);
-}
-
-/// Atomically xor the value at the addr with the value operand, and assign
-/// the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to use in bitwise XOR operation with the value at the \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T,
-          sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-inline T atomic_fetch_xor(T *addr, T operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_xor(operand);
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T1, typename T2>
-inline T1 atomic_fetch_xor(T1 *addr, T2 operand) {
-  auto atm =
-      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_xor(operand);
-}
-
-/// Atomically xor the value at the addr with the value operand, and assign
-/// the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to use in bitwise XOR operation with the value at the \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T, sycl::access::address_space addressSpace =
-                          sycl::access::address_space::global_space>
-inline T atomic_fetch_xor(T *addr, T operand,
-                          sycl::memory_order memoryOrder) {
-  switch (memoryOrder) {
-  case sycl::memory_order::relaxed:
-    return atomic_fetch_xor<T, addressSpace, sycl::memory_order::relaxed,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::acq_rel:
-    return atomic_fetch_xor<T, addressSpace, sycl::memory_order::acq_rel,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::seq_cst:
-    return atomic_fetch_xor<T, addressSpace, sycl::memory_order::seq_cst,
-                            sycl::memory_scope::device>(addr, operand);
-  default:
-    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
-                    "atomics are: sycl::memory_order::relaxed, "
-                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
-  }
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          typename T1, typename T2>
-inline T1 atomic_fetch_xor(T1 *addr, T2 operand,
-                           sycl::memory_order memoryOrder) {
-  atomic_fetch_xor<T1, addressSpace>(addr, operand, memoryOrder);
-}
-
-/// Atomically calculate the minimum of the value at addr and the value operand
-/// and assign the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T,
-          sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-inline T atomic_fetch_min(T *addr, T operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_min(operand);
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T1, typename T2>
-inline T1 atomic_fetch_min(T1 *addr, T2 operand) {
-  auto atm =
-      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_min(operand);
-}
-
-/// Atomically calculate the minimum of the value at addr and the value operand
-/// and assign the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T, sycl::access::address_space addressSpace =
-                          sycl::access::address_space::global_space>
-inline T atomic_fetch_min(T *addr, T operand,
-                          sycl::memory_order memoryOrder) {
-  switch (memoryOrder) {
-  case sycl::memory_order::relaxed:
-    return atomic_fetch_min<T, addressSpace, sycl::memory_order::relaxed,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::acq_rel:
-    return atomic_fetch_min<T, addressSpace, sycl::memory_order::acq_rel,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::seq_cst:
-    return atomic_fetch_min<T, addressSpace, sycl::memory_order::seq_cst,
-                            sycl::memory_scope::device>(addr, operand);
-  default:
-    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
-                    "atomics are: sycl::memory_order::relaxed, "
-                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
-  }
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          typename T1, typename T2>
-inline T1 atomic_fetch_min(T1 *addr, T2 operand,
-                           sycl::memory_order memoryOrder) {
-  atomic_fetch_min<T1, addressSpace>(addr, operand, memoryOrder);
-}
-
-/// Atomically calculate the maximum of the value at addr and the value operand
-/// and assign the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T,
-          sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-inline T atomic_fetch_max(T *addr, T operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_max(operand);
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T1, typename T2>
-inline T1 atomic_fetch_max(T1 *addr, T2 operand) {
-  auto atm =
-      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.fetch_max(operand);
-}
-
-/// Atomically calculate the maximum of the value at addr and the value operand
-/// and assign the result to the value at addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T, sycl::access::address_space addressSpace =
-                          sycl::access::address_space::global_space>
-inline T atomic_fetch_max(T *addr, T operand,
-                          sycl::memory_order memoryOrder) {
-  switch (memoryOrder) {
-  case sycl::memory_order::relaxed:
-    return atomic_fetch_max<T, addressSpace, sycl::memory_order::relaxed,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::acq_rel:
-    return atomic_fetch_max<T, addressSpace, sycl::memory_order::acq_rel,
-                            sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::seq_cst:
-    return atomic_fetch_max<T, addressSpace, sycl::memory_order::seq_cst,
-                            sycl::memory_scope::device>(addr, operand);
-  default:
-    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
-                    "atomics are: sycl::memory_order::relaxed, "
-                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
-  }
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          typename T1, typename T2>
-inline T1 atomic_fetch_max(T1 *addr, T2 operand,
-                           sycl::memory_order memoryOrder) {
-  atomic_fetch_max<T1, addressSpace>(addr, operand, memoryOrder);
-}
-
-/// Atomically set \p operand to the value stored in \p addr, if old value stored in
-/// \p addr is equal to zero or greater than \p operand, else decrease the value stored
-/// in \p addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The threshold value.
-/// \param memoryOrder The memory ordering used.
-/// \returns The old value stored in \p addr.
-template <sycl::access::address_space addressSpace = sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-inline unsigned int atomic_fetch_compare_dec(unsigned int *addr,
-                                             unsigned int operand) {
-  auto atm = sycl::atomic_ref<unsigned int, memoryOrder, memoryScope,
-                                  addressSpace>(addr[0]);
-  unsigned int old;
-
-	while (true) {
-	  old = atm.load();
-	  if (old == 0 || old > operand) {
-		  if (atm.compare_exchange_strong(old, operand))
-        break;
-	  } else if (atm.compare_exchange_strong(old, old - 1))
-      break;
-	}
-
-  return old;
-}
-
-/// Atomically increment the value stored in \p addr if old value stored in \p
-/// addr is less than \p operand, else set 0 to the value stored in \p addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The threshold value.
-/// \param memoryOrder The memory ordering used.
-/// \returns The old value stored in \p addr.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-inline unsigned int atomic_fetch_compare_inc(unsigned int *addr,
-                                             unsigned int operand) {
-  auto atm = sycl::atomic_ref<unsigned int, memoryOrder, memoryScope,
-                                  addressSpace>(addr[0]);
-  unsigned int old;
-  while (true) {
-    old = atm.load();
-    if (old >= operand) {
-      if (atm.compare_exchange_strong(old, 0))
-        break;
-    } else if (atm.compare_exchange_strong(old, old + 1))
-      break;
-  }
-  return old;
-}
-
-/// Atomically increment the value stored in \p addr if old value stored in \p
-/// addr is less than \p operand, else set 0 to the value stored in \p addr.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The threshold value.
-/// \param memoryOrder The memory ordering used.
-/// \returns The old value stored in \p addr.
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space>
-inline unsigned int
-atomic_fetch_compare_inc(unsigned int *addr, unsigned int operand,
-                         sycl::memory_order memoryOrder) {
-  switch (memoryOrder) {
-  case sycl::memory_order::relaxed:
-    return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::relaxed,
-                                    sycl::memory_scope::device>(addr,
-                                                                   operand);
-  case sycl::memory_order::acq_rel:
-    return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::acq_rel,
-                                    sycl::memory_scope::device>(addr,
-                                                                   operand);
-  case sycl::memory_order::seq_cst:
-    return atomic_fetch_compare_inc<addressSpace, sycl::memory_order::seq_cst,
-                                    sycl::memory_scope::device>(addr,
-                                                                   operand);
-  default:
-    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
-                    "atomics are: sycl::memory_order::relaxed, "
-                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
-  }
-}
-
-/// Atomically exchange the value at the address addr with the value operand.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to be exchanged with the value pointed by \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T,
-          sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-inline T atomic_exchange(T *addr, T operand) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.exchange(operand);
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T1, typename T2>
-inline T1 atomic_exchange(T1 *addr, T2 operand) {
-  auto atm =
-      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  return atm.exchange(operand);
-}
-
-/// Atomically exchange the value at the address addr with the value operand.
-/// \param [in, out] addr The pointer to the data.
-/// \param operand The value to be exchanged with the value pointed by \p addr.
-/// \param memoryOrder The memory ordering used.
-/// \returns The value at the \p addr before the call.
-template <typename T, sycl::access::address_space addressSpace =
-                          sycl::access::address_space::global_space>
-inline T atomic_exchange(T *addr, T operand,
-                         sycl::memory_order memoryOrder) {
-  switch (memoryOrder) {
-  case sycl::memory_order::relaxed:
-    return atomic_exchange<T, addressSpace, sycl::memory_order::relaxed,
-                           sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::acq_rel:
-    return atomic_exchange<T, addressSpace, sycl::memory_order::acq_rel,
-                           sycl::memory_scope::device>(addr, operand);
-  case sycl::memory_order::seq_cst:
-    return atomic_exchange<T, addressSpace, sycl::memory_order::seq_cst,
-                           sycl::memory_scope::device>(addr, operand);
-  default:
-    assert(false && "Invalid memory_order for atomics. Valid memory_order for "
-                    "atomics are: sycl::memory_order::relaxed, "
-                    "sycl::memory_order::acq_rel, sycl::memory_order::seq_cst!");
-  }
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          typename T1, typename T2>
-inline T1 atomic_exchange(T1 *addr, T2 operand,
-                           sycl::memory_order memoryOrder) {
-  atomic_exchange<T1, addressSpace>(addr, operand, memoryOrder);
-}
-
-/// Atomically compare the value at \p addr to the value expected and exchange
-/// with the value desired if the value at \p addr is equal to the value expected.
-/// Returns the value at the \p addr before the call.
-/// \param [in, out] addr Multi_ptr.
-/// \param expected The value to compare against the value at \p addr.
-/// \param desired The value to assign to \p addr if the value at \p addr is expected.
-/// \param success The memory ordering used when comparison succeeds.
-/// \param fail The memory ordering used when comparison fails.
-/// \returns The value at the \p addr before the call.
-template <typename T,
-          sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-T atomic_compare_exchange_strong(
-    sycl::multi_ptr<T, addressSpace> addr, T expected, T desired,
-    sycl::memory_order success = sycl::memory_order::relaxed,
-    sycl::memory_order fail = sycl::memory_order::relaxed) {
-  auto atm = sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(*addr);
-
-  atm.compare_exchange_strong(expected, desired, success, fail);
-  return expected;
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T1, typename T2, typename T3>
-T1 atomic_compare_exchange_strong(
-    sycl::multi_ptr<T1, addressSpace> addr, T2 expected, T3 desired,
-    sycl::memory_order success = sycl::memory_order::relaxed,
-    sycl::memory_order fail = sycl::memory_order::relaxed) {
-  auto atm =
-      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(*addr);
-  T1 expected_value = expected;
-  atm.compare_exchange_strong(expected_value, desired, success, fail);
-  return expected_value;
-}
-
-/// Atomically compare the value at \p addr to the value expected and exchange
-/// with the value desired if the value at \p addr is equal to the value expected.
-/// Returns the value at the \p addr before the call.
-/// \param [in] addr The pointer to the data.
-/// \param expected The value to compare against the value at \p addr.
-/// \param desired The value to assign to \p addr if the value at \p addr is expected.
-/// \param success The memory ordering used when comparison succeeds.
-/// \param fail The memory ordering used when comparison fails.
-/// \returns The value at the \p addr before the call.
-template <typename T,
-          sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device>
-T atomic_compare_exchange_strong(
-    T *addr, T expected, T desired,
-    sycl::memory_order success = sycl::memory_order::relaxed,
-    sycl::memory_order fail = sycl::memory_order::relaxed) {
-  auto atm =
-      sycl::atomic_ref<T, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  atm.compare_exchange_strong(expected, desired, success, fail);
-  return expected;
-}
-
-template <sycl::access::address_space addressSpace =
-              sycl::access::address_space::global_space,
-          sycl::memory_order memoryOrder = sycl::memory_order::relaxed,
-          sycl::memory_scope memoryScope = sycl::memory_scope::device,
-          typename T1, typename T2, typename T3>
-T1 atomic_compare_exchange_strong(
-    T1 *addr, T2 expected, T3 desired,
-    sycl::memory_order success = sycl::memory_order::relaxed,
-    sycl::memory_order fail = sycl::memory_order::relaxed) {
-  T1 expected_value = expected;
-  auto atm =
-      sycl::atomic_ref<T1, memoryOrder, memoryScope, addressSpace>(addr[0]);
-  atm.compare_exchange_strong(expected_value, desired, success, fail);
-  return expected_value;
-}
-
-/// Atomic extension to implement standard APIs in std::atomic
-namespace detail{
-template <typename T> struct IsValidAtomicType {
-  static constexpr bool value =
-      (std::is_same<T, int>::value || std::is_same<T, unsigned int>::value ||
-       std::is_same<T, long>::value || std::is_same<T, unsigned long>::value ||
-       std::is_same<T, long long>::value ||
-       std::is_same<T, unsigned long long>::value ||
-       std::is_same<T, float>::value || std::is_same<T, double>::value ||
-       std::is_pointer<T>::value);
-};
-} // namespace detail
-
-template <typename T,
-          sycl::memory_scope DefaultScope = sycl::memory_scope::system,
-          sycl::memory_order DefaultOrder = sycl::memory_order::seq_cst,
-          sycl::access::address_space Space =
-              sycl::access::address_space::generic_space>
-class atomic{
-  static_assert(
-    detail::IsValidAtomicType<T>::value,
-    "Invalid atomic type.  Valid types are int, unsigned int, long, "
-      "unsigned long, long long, unsigned long long, float, double "
-      "and pointer types");
-  T __d;
-
-public:
-  /// default memory synchronization order
-  static constexpr sycl::memory_order default_read_order =
-      sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>::default_read_order;
-  static constexpr sycl::memory_order default_write_order =
-      sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space>::default_write_order;
-  static constexpr sycl::memory_scope default_scope = DefaultScope;
-  static constexpr sycl::memory_order default_read_modify_write_order =
-      DefaultOrder;
-  
-
-  /// Default constructor.
-  constexpr atomic() noexcept = default;
-  /// Constructor with initialize value.
-  constexpr atomic(T d) noexcept : __d(d){};
-
-  /// atomically replaces the value of the referenced object with a non-atomic argument
-  /// \param operand The value to replace the pointed value.
-  /// \param memoryOrder The memory ordering used.
-  /// \param memoryScope The memory scope used.
-  void store(T operand, sycl::memory_order memoryOrder = default_write_order,
-             sycl::memory_scope memoryScope = default_scope) noexcept {
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    atm.store(operand, memoryOrder, memoryScope);
-  }
-
-  /// atomically obtains the value of the referenced object
-  /// \param memoryOrder The memory ordering used.
-  /// \param memoryScope The memory scope used.
-  /// \returns The value of the referenced object
-  T load(sycl::memory_order memoryOrder = default_read_order,
-         sycl::memory_scope memoryScope = default_scope) const noexcept {
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(
-      const_cast<T &>(__d));
-    return atm.load(memoryOrder, memoryScope);
-  }
-
-  /// atomically replaces the value of the referenced object and obtains the value held previously
-  /// \param operand The value to replace the pointed value.
-  /// \param memoryOrder The memory ordering used.
-  /// \param memoryScope The memory scope used.
-  /// \returns The value of the referenced object before the call.
-  T exchange(T operand,
-             sycl::memory_order memoryOrder = default_read_modify_write_order,
-             sycl::memory_scope memoryScope = default_scope) noexcept {
-
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.exchange(operand, memoryOrder, memoryScope);
-  }
-
-  /// atomically compares the value of the referenced object with non-atomic argument 
-  /// and performs atomic exchange if equal or atomic load if not
-  /// \param expected The value expected to be found in the object referenced by the atomic_ref object
-  /// \param desired  The value to store in the referenced object if it is as expected
-  /// \param success The memory models for the read-modify-write
-  /// \param failure The memory models for load operations
-  /// \param memoryScope The memory scope used.
-  /// \returns true if the referenced object was successfully changed, false otherwise.
-  bool compare_exchange_weak(
-      T &expected, T desired,
-      sycl::memory_order success, sycl::memory_order failure,
-      sycl::memory_scope memoryScope = default_scope) noexcept {
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.compare_exchange_weak(expected, desired, success, failure, memoryScope);
-  }
-  /// \param expected The value expected to be found in the object referenced by the atomic_ref object
-  /// \param desired  The value to store in the referenced object if it is as expected
-  /// \param memoryOrder 	The memory synchronization ordering for operations
-  /// \param memoryScope The memory scope used.
-  /// \returns true if the referenced object was successfully changed, false otherwise.
-  bool compare_exchange_weak(T &expected, T desired,
-                  sycl::memory_order memoryOrder = default_read_modify_write_order,
-                  sycl::memory_scope memoryScope = default_scope) noexcept {
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.compare_exchange_weak(expected, desired, memoryOrder, memoryScope);
-  }
-
-  /// atomically compares the value of the referenced object with non-atomic argument 
-  /// and performs atomic exchange if equal or atomic load if not
-  /// \param expected The value expected to be found in the object referenced by the atomic_ref object
-  /// \param desired  The value to store in the referenced object if it is as expected
-  /// \param success The memory models for the read-modify-write
-  /// \param failure The memory models for load operations
-  /// \param memoryScope The memory scope used.
-  /// \returns true if the referenced object was successfully changed, false otherwise.
-  bool compare_exchange_strong(
-      T &expected, T desired,
-      sycl::memory_order success, sycl::memory_order failure,
-      sycl::memory_scope memoryScope = default_scope) noexcept {
-
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.compare_exchange_strong(expected, desired, success, failure, memoryScope);
-  }
-  /// \param expected The value expected to be found in the object referenced by the atomic_ref object
-  /// \param desired  The value to store in the referenced object if it is as expected
-  /// \param memoryOrder 	The memory synchronization ordering for operations
-  /// \param memoryScope The memory scope used.
-  /// \returns true if the referenced object was successfully changed, false otherwise.
-  bool compare_exchange_strong(T &expected, T desired,
-                    sycl::memory_order memoryOrder = default_read_modify_write_order,
-                    sycl::memory_scope memoryScope = default_scope) noexcept {
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.compare_exchange_strong(expected, desired, memoryOrder, memoryScope);
-  }
-
-  /// atomically adds the argument to the value stored in the atomic object and obtains the value held previously
-  /// \param operand 	The other argument of arithmetic addition
-  /// \param memoryOrder The memory ordering used.
-  /// \param memoryScope The memory scope used.
-  /// \returns The value of the referenced object before the call.
-  T fetch_add(T operand,
-              sycl::memory_order memoryOrder = default_read_modify_write_order,
-              sycl::memory_scope  memoryScope = default_scope) noexcept {
-
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.fetch_add(operand, memoryOrder,  memoryScope);
-  }
-
-  /// atomically subtracts the argument from the value stored in the atomic object and obtains the value held previously
-  /// \param operand 	The other argument of arithmetic subtraction
-  /// \param memoryOrder The memory ordering used.
-  /// \param memoryScope The memory scope used.
-  /// \returns The value of the referenced object before the call.
-  T fetch_sub(T operand,
-              sycl::memory_order memoryOrder = default_read_modify_write_order,
-              sycl::memory_scope memoryScope = default_scope) noexcept {
-
-    sycl::atomic_ref<T, DefaultOrder, DefaultScope, Space> atm(__d);
-    return atm.fetch_sub(operand, memoryOrder, memoryScope);
-  }
-};
-
-} // namespace dpct
-#endif // __DPCT_ATOMIC_HPP__
diff --git a/dpct/blas_utils.hpp b/dpct/blas_utils.hpp
deleted file mode 100644
index df222c528bc08..0000000000000
--- a/dpct/blas_utils.hpp
+++ /dev/null
@@ -1,1792 +0,0 @@
-//==---- blas_utils.hpp----------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_BLAS_UTILS_HPP__
-#define __DPCT_BLAS_UTILS_HPP__
-
-#include "memory.hpp"
-#include "util.hpp"
-#include "lib_common_utils.hpp"
-#include <sycl/sycl.hpp>
-#include <oneapi/mkl.hpp>
-#include <utility>
-#include <vector>
-#include <thread>
-
-namespace dpct {
-
-/// Get the value of \p s.
-/// Copy the data to host synchronously, then return the data.
-/// \param [in] p The pointer points the data.
-/// \param [in] q The queue where the memory copy should be executed.
-template <typename T>
-inline auto get_value(const T *s, sycl::queue &q) {
-  return detail::get_value(s, q);
-}
-
-namespace detail {
-inline void mem_free(sycl::queue *exec_queue,
-                     std::vector<void *> pointers_array, sycl::event e) {
-  e.wait();
-  for (auto p : pointers_array)
-    sycl::free(p, *exec_queue);
-}
-
-inline int stride_for(int num_elems, int mem_align_in_elems) {
-  return ((num_elems - 1) / mem_align_in_elems + 1) * mem_align_in_elems;
-}
-
-#ifndef DPCT_USM_LEVEL_NONE
-template<typename T>
-class working_memory {
-  T *_input_ptr;
-  T *_temp_ptr;
-  bool _is_sycl_malloced = false;
-  bool _is_scalar_value = false;
-  sycl::queue _q;
-  sycl::event _e;
-
-public:
-  working_memory(size_t size, sycl::queue q) : _q(q) {
-    _is_scalar_value = false;
-    _temp_ptr = (T *)sycl::malloc_device(size, q);
-  }
-  working_memory(T *result_ptr, sycl::queue q) : _input_ptr(result_ptr), _q(q) {
-    _is_scalar_value = true;
-    _is_sycl_malloced = sycl::get_pointer_type(_input_ptr, _q.get_context()) !=
-                        sycl::usm::alloc::unknown;
-    if (!_is_sycl_malloced)
-      _temp_ptr = sycl::malloc_shared<T>(1, _q);
-  }
-  auto get_ptr() {
-    if (_is_scalar_value && _is_sycl_malloced)
-      return _input_ptr;
-    return _temp_ptr;
-  }
-  void set_event(sycl::event e) { _e = e; }
-  ~working_memory() {
-    if (_is_scalar_value) {
-      if (!_is_sycl_malloced) {
-        _q.memcpy(_input_ptr, _temp_ptr, sizeof(T)).wait();
-        sycl::free(_temp_ptr, _q);
-      }
-    } else {
-      std::vector<void *> ptrs{_temp_ptr};
-      dpct::async_dpct_free(ptrs, {_e});
-    }
-  }
-};
-#endif
-
-template <typename Tx, typename Tr>
-inline void nrm2_impl(sycl::queue &q, int n, const void *x, int incx,
-                         void *result) {
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                           "Project does not support this API.");
-#else
-#ifdef DPCT_USM_LEVEL_NONE
-  auto x_buffer = dpct::get_buffer<Tx>(x);
-  auto r_buffer =
-      sycl::buffer<Tr, 1>(reinterpret_cast<Tr *>(result), sycl::range<1>(1));
-  if (dpct::is_device_ptr(result))
-    r_buffer = dpct::get_buffer<Tr>(result);
-  oneapi::mkl::blas::column_major::nrm2(q, n, x_buffer, incx, r_buffer);
-#else
-  working_memory<Tr> res_mem(reinterpret_cast<Tr *>(result), q);
-  oneapi::mkl::blas::column_major::nrm2(q, n, reinterpret_cast<const Tx *>(x),
-                                        incx, res_mem.get_ptr());
-#endif
-#endif
-}
-
-template <bool is_conjugate, class Txy, class Tr>
-inline void dotuc_impl(sycl::queue &q, int n, const Txy *x, int incx,
-                          const Txy *y, int incy, Tr *result) {
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                           "Project does not support this API.");
-#else
-#ifdef DPCT_USM_LEVEL_NONE
-  auto x_buffer = dpct::get_buffer<Txy>(x);
-  auto y_buffer = dpct::get_buffer<Txy>(y);
-  auto r_buffer = sycl::buffer<Tr, 1>((Tr *)result, sycl::range<1>(1));
-  if (dpct::is_device_ptr(result))
-    r_buffer = dpct::get_buffer<Tr>(result);
-  if constexpr (std::is_same_v<Txy, std::complex<float>> ||
-                std::is_same_v<Txy, std::complex<double>>) {
-    if constexpr (is_conjugate)
-      oneapi::mkl::blas::column_major::dotc(q, n, x_buffer, incx, y_buffer,
-                                            incy, r_buffer);
-    else
-      oneapi::mkl::blas::column_major::dotu(q, n, x_buffer, incx, y_buffer,
-                                            incy, r_buffer);
-  } else
-    oneapi::mkl::blas::column_major::dot(q, n, x_buffer, incx, y_buffer, incy,
-                                         r_buffer);
-#else
-  working_memory<Tr> res_mem(result, q);
-  if constexpr (std::is_same_v<Txy, std::complex<float>> ||
-                std::is_same_v<Txy, std::complex<double>>) {
-    if constexpr (is_conjugate)
-      oneapi::mkl::blas::column_major::dotc(q, n, x, incx, y, incy, res_mem.get_ptr());
-    else
-      oneapi::mkl::blas::column_major::dotu(q, n, x, incx, y, incy, res_mem.get_ptr());
-  } else
-    oneapi::mkl::blas::column_major::dot(q, n, x, incx, y, incy, res_mem.get_ptr());
-#endif
-#endif
-}
-
-template <bool is_conjugate>
-inline void dotuc(sycl::queue &q, int n, const void *x,
-                     library_data_t x_type, int incx, const void *y,
-                     library_data_t y_type, int incy, void *result,
-                     library_data_t result_type) {
-  std::uint64_t key = detail::get_type_combination_id(x_type, y_type, result_type);
-  switch (key) {
-  case detail::get_type_combination_id(library_data_t::real_float, library_data_t::real_float,
-                       library_data_t::real_float): {
-    detail::dotuc_impl<is_conjugate>(
-        q, n, reinterpret_cast<const float *>(x), incx,
-        reinterpret_cast<const float *>(y), incy,
-        reinterpret_cast<float *>(result));
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_double, library_data_t::real_double,
-                       library_data_t::real_double): {
-    detail::dotuc_impl<is_conjugate>(
-        q, n, reinterpret_cast<const double *>(x), incx,
-        reinterpret_cast<const double *>(y), incy,
-        reinterpret_cast<double *>(result));
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_float,
-                       library_data_t::complex_float,
-                       library_data_t::complex_float): {
-    detail::dotuc_impl<is_conjugate>(
-        q, n, reinterpret_cast<const std::complex<float> *>(x), incx,
-        reinterpret_cast<const std::complex<float> *>(y), incy,
-        reinterpret_cast<std::complex<float> *>(result));
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_double,
-                       library_data_t::complex_double,
-                       library_data_t::complex_double): {
-    detail::dotuc_impl<is_conjugate>(
-        q, n, reinterpret_cast<const std::complex<double> *>(x), incx,
-        reinterpret_cast<const std::complex<double> *>(y), incy,
-        reinterpret_cast<std::complex<double> *>(result));
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_half, library_data_t::real_half,
-                       library_data_t::real_half): {
-    detail::dotuc_impl<is_conjugate>(
-        q, n, reinterpret_cast<const sycl::half *>(x), incx,
-        reinterpret_cast<const sycl::half *>(y), incy,
-        reinterpret_cast<sycl::half *>(result));
-    break;
-  }
-  default:
-    throw std::runtime_error("the combination of data type is unsupported");
-  }
-}
-
-template <class Tx, class Te>
-inline void scal_impl(sycl::queue &q, int n, const void *alpha, void *x,
-                         int incx) {
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                           "Project does not support this API.");
-#else
-  Te alpha_val = dpct::get_value(reinterpret_cast<const Te *>(alpha), q);
-  auto data_x = get_memory<Tx>(x);
-  oneapi::mkl::blas::column_major::scal(q, n, alpha_val,
-                                        data_x, incx);
-#endif
-}
-
-template <class Txy, class Te>
-inline void axpy_impl(sycl::queue &q, int n, const void *alpha, const void *x,
-                        int incx, void *y, int incy) {
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                           "Project does not support this API.");
-#else
-  Te alpha_val = dpct::get_value(reinterpret_cast<const Te *>(alpha), q);
-  auto data_x = get_memory<const Txy>(x);
-  auto data_y = get_memory<Txy>(y);
-  oneapi::mkl::blas::column_major::axpy(q, n, alpha_val,
-                                        data_x, incx,
-                                        data_y, incy);
-#endif
-}
-
-template <class Txy, class Tc, class Ts>
-inline void rot_impl(sycl::queue &q, int n, void *x, int incx, void *y,
-                        int incy, const void *c, const void *s) {
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                           "Project does not support this API.");
-#else
-  Tc c_value = dpct::get_value(reinterpret_cast<const Tc *>(c), q);
-  Ts s_value = dpct::get_value(reinterpret_cast<const Ts *>(s), q);
-  auto data_x = get_memory<Txy>(x);
-  auto data_y = get_memory<Txy>(y);
-  oneapi::mkl::blas::column_major::rot(q, n, data_x, incx,
-                                       data_y, incy, c_value,
-                                       s_value);
-#endif
-}
-
-template <class Ta, class Tb, class Tc, class Ts>
-inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                         oneapi::mkl::transpose b_trans, int m, int n, int k,
-                         const void *alpha, const void *a, int lda, const void *b,
-                         int ldb, const void *beta, void *c, int ldc) {
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                           "Project does not support this API.");
-#else
-  Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-  Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-  auto data_a = get_memory<const Ta>(a);
-  auto data_b = get_memory<const Tb>(b);
-  auto data_c = get_memory<Tc>(c);
-  oneapi::mkl::blas::column_major::gemm(
-      q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
-      data_b, ldb, beta_value, data_c, ldc);
-#endif
-}
-
-template <class Ta, class Tb, class Tc, class Ts>
-inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                            oneapi::mkl::transpose b_trans, int m, int n, int k,
-                            const void *alpha, const void **a, int lda,
-                            const void **b, int ldb, const void *beta, void **c,
-                            int ldc, int batch_size) {
-  struct matrix_info_t {
-    oneapi::mkl::transpose transpose_info[2];
-    Ts value_info[2];
-    std::int64_t size_info[3];
-    std::int64_t ld_info[3];
-    std::int64_t groupsize_info;
-  };
-
-  Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-  Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-
-  matrix_info_t *matrix_info =
-      (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
-  matrix_info->transpose_info[0] = a_trans;
-  matrix_info->transpose_info[1] = b_trans;
-  matrix_info->value_info[0] = alpha_value;
-  matrix_info->value_info[1] = beta_value;
-  matrix_info->size_info[0] = m;
-  matrix_info->size_info[1] = n;
-  matrix_info->size_info[2] = k;
-  matrix_info->ld_info[0] = lda;
-  matrix_info->ld_info[1] = ldb;
-  matrix_info->ld_info[2] = ldc;
-  matrix_info->groupsize_info = batch_size;
-
-  sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
-      q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
-      matrix_info->size_info, matrix_info->size_info + 1,
-      matrix_info->size_info + 2, matrix_info->value_info,
-      reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
-      reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
-      matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
-      matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
-
-  q.submit([&](sycl::handler &cgh) {
-    cgh.depends_on(e);
-    cgh.host_task([=] { std::free(matrix_info); });
-  });
-}
-
-template <class Ta, class Tb, class Tc, class Ts>
-inline void
-gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                    oneapi::mkl::transpose b_trans, int m, int n,
-                    int k, const void *alpha, const void *a, int lda,
-                    long long int stride_a, const void *b, int ldb,
-                    long long int stride_b, const void *beta, void *c,
-                    int ldc, long long int stride_c, int batch_size) {
-  Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-  Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-  auto data_a = get_memory<const Ta>(a);
-  auto data_b = get_memory<const Tb>(b);
-  auto data_c = get_memory<Tc>(c);
-  oneapi::mkl::blas::column_major::gemm_batch(
-      q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
-      stride_a, data_b, ldb, stride_b, beta_value,
-      data_c, ldc, stride_c, batch_size);
-}
-
-template <bool is_hermitian, class T, class Tbeta>
-inline void rk_impl(sycl::queue &q, oneapi::mkl::uplo uplo,
-                          oneapi::mkl::transpose trans, int n, int k,
-                          const T *alpha, const T *a, int lda, const T *b,
-                          int ldb, const Tbeta *beta, T *c, int ldc) {
-  // For symmetric matrix, this function performs: C = alpha*OP(A)*(OP(B))^T + beta*C
-  // For Hermitian matrix, this function performs: C = alpha*OP(A)*(OP(B))^H + beta*C
-  // The gemmt() function performs: C = alpha*OPA(A)*OPB(B) + beta*C
-  // So the OPB need be updated before we call gemmt().
-  using Ty = typename dpct::DataType<T>::T2;
-  using Ts = typename dpct::DataType<Tbeta>::T2;
-  Ty alpha_value = dpct::get_value(reinterpret_cast<const Ty *>(alpha), q);
-  Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-  oneapi::mkl::transpose trans_A = trans, trans_B = trans;
-  int origin_b_rows = trans == oneapi::mkl::transpose::nontrans ? n : k;
-  int origin_b_cols = trans == oneapi::mkl::transpose::nontrans ? k : n;
-
-  if ((is_hermitian && trans == oneapi::mkl::transpose::trans) ||
-      (!is_hermitian && !std::is_floating_point_v<Ty> && trans == oneapi::mkl::transpose::conjtrans)) {
-    // In this case, OPB need be a conjugate operation,
-    // but only notrans, conjtrans and trans are available.
-    // So we need do a conjtrans operation first, then do a trans operation.
-    trans_B = oneapi::mkl::transpose::trans;
-    auto data_a = get_memory<const Ty>(a);
-    auto data_c = get_memory<Ty>(c);
-#ifdef DPCT_USM_LEVEL_NONE
-    auto new_B_buffer = sycl::buffer<Ty, 1>(sycl::range<1>(origin_b_rows * origin_b_cols));
-    auto from_buffer = dpct::get_buffer<Ty>(b);
-    oneapi::mkl::blas::column_major::omatcopy_batch(
-          q, oneapi::mkl::transpose::conjtrans, origin_b_rows, origin_b_cols,
-          Ts(1.0), from_buffer, ldb, origin_b_rows * ldb, new_B_buffer,
-          origin_b_cols, origin_b_rows * origin_b_cols, 1);
-    oneapi::mkl::blas::column_major::gemmt(
-        q, uplo, trans_A, trans_B, n, k, alpha_value,
-        data_a, lda, new_B_buffer, origin_b_cols, beta_value, data_c, ldc);
-#else
-    working_memory<T> new_B(origin_b_rows * origin_b_cols * sizeof(T), q);
-    oneapi::mkl::blas::column_major::omatcopy_batch(
-        q, oneapi::mkl::transpose::conjtrans, origin_b_rows, origin_b_cols,
-        Ts(1.0), reinterpret_cast<const Ty *>(b), ldb, origin_b_rows * ldb,
-        reinterpret_cast<Ty *>(new_B.get_ptr()), origin_b_cols,
-        origin_b_rows * origin_b_cols, 1);
-    sycl::event e = oneapi::mkl::blas::column_major::gemmt(
-        q, uplo, trans_A, trans_B, n, k, alpha_value,
-        data_a, lda, reinterpret_cast<Ty *>(new_B.get_ptr()), origin_b_cols,
-        beta_value, data_c, ldc);
-    new_B.set_event(e);
-#endif
-  } else {
-    if constexpr (is_hermitian) {
-      trans_B = trans == oneapi::mkl::transpose::nontrans
-                  ? oneapi::mkl::transpose::conjtrans
-                  : oneapi::mkl::transpose::nontrans;
-    } else {
-      trans_B = trans == oneapi::mkl::transpose::nontrans
-                  ? oneapi::mkl::transpose::trans
-                  : oneapi::mkl::transpose::nontrans;
-    }
-    auto data_a = get_memory<const Ty>(a);
-    auto data_b = get_memory<const Ty>(b);
-    auto data_c = get_memory<Ty>(c);
-    oneapi::mkl::blas::column_major::gemmt(
-        q, uplo, trans_A, trans_B, n, k, alpha_value,
-        data_a, lda, data_b, ldb, beta_value, data_c, ldc);
-  }
-}
-
-template <class Ta, class Tb, class Ts>
-inline void
-trsm_batch_impl(sycl::queue &q, oneapi::mkl::side left_right,
-                oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                oneapi::mkl::diag unit_diag, int m, int n, const void *alpha,
-                const void **a, int lda, void **b, int ldb, int batch_size) {
-  struct matrix_info_t {
-    matrix_info_t(oneapi::mkl::side side_info, oneapi::mkl::uplo uplo_info,
-                  oneapi::mkl::transpose transpose_info,
-                  oneapi::mkl::diag diag_info, Ts value_info, std::int64_t m,
-                  std::int64_t n, std::int64_t lda, std::int64_t ldb,
-                  std::int64_t groupsize_info)
-        : side_info(side_info), uplo_info(uplo_info),
-          transpose_info(transpose_info), diag_info(diag_info),
-          value_info(value_info), groupsize_info(groupsize_info) {
-      size_info[0] = m;
-      size_info[1] = n;
-      ld_info[0] = lda;
-      ld_info[1] = ldb;
-    }
-    oneapi::mkl::side side_info;
-    oneapi::mkl::uplo uplo_info;
-    oneapi::mkl::transpose transpose_info;
-    oneapi::mkl::diag diag_info;
-    Ts value_info;
-    std::int64_t size_info[2];
-    std::int64_t ld_info[2];
-    std::int64_t groupsize_info;
-  };
-
-  Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-
-  matrix_info_t *matrix_info =
-      new matrix_info_t(left_right, upper_lower, trans, unit_diag, alpha_value,
-                        m, n, lda, ldb, batch_size);
-
-  sycl::event e = oneapi::mkl::blas::column_major::trsm_batch(
-      q, &(matrix_info->side_info), &(matrix_info->uplo_info),
-      &(matrix_info->transpose_info), &(matrix_info->diag_info),
-      matrix_info->size_info, matrix_info->size_info + 1,
-      &(matrix_info->value_info), reinterpret_cast<const Ta **>(a),
-      matrix_info->ld_info, reinterpret_cast<Tb **>(b),
-      matrix_info->ld_info + 1, 1, &(matrix_info->groupsize_info));
-
-  q.submit([&](sycl::handler &cgh) {
-    cgh.depends_on(e);
-    cgh.host_task([=] { delete matrix_info; });
-  });
-}
-
-template <typename T>
-inline void getrfnp_batch_wrapper(sycl::queue &exec_queue, int n, T *a[],
-                                  int lda, int *info, int batch_size) {
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                           "Project does not support this API.");
-#else
-  using Ty = typename DataType<T>::T2;
-  // Set the info array value to 0
-  detail::dpct_memset<unsigned char>(exec_queue, info, 0, sizeof(int) * batch_size);
-  std::int64_t stride_a = n * lda;
-  std::int64_t scratchpad_size =
-      oneapi::mkl::lapack::getrfnp_batch_scratchpad_size<Ty>(
-          exec_queue, n, n, lda, stride_a, batch_size);
-
-  Ty *a_strided_mem =
-      (Ty *)dpct::dpct_malloc(stride_a * batch_size * sizeof(Ty), exec_queue);
-  T **host_a = (T **)std::malloc(batch_size * sizeof(T *));
-  dpct::dpct_memcpy(host_a, a, batch_size * sizeof(T *));
-  for (std::int64_t i = 0; i < batch_size; ++i)
-    dpct::dpct_memcpy(a_strided_mem + i * stride_a, host_a[i],
-                      n * lda * sizeof(T));
-
-#ifdef DPCT_USM_LEVEL_NONE
-  {
-    sycl::buffer<Ty, 1> scratchpad{sycl::range<1>(scratchpad_size)};
-    auto a_buffer = get_buffer<Ty>(a_strided_mem);
-    oneapi::mkl::lapack::getrfnp_batch(exec_queue, n, n, a_buffer, lda,
-                                       stride_a, batch_size, scratchpad,
-                                       scratchpad_size);
-  }
-  std::vector<sycl::event> events;
-  for (std::int64_t i = 0; i < batch_size; ++i)
-    events.push_back(detail::dpct_memcpy(exec_queue, host_a[i],
-                                         a_strided_mem + i * stride_a,
-                                         n * lda * sizeof(T), automatic));
-#else
-  Ty *scratchpad = sycl::malloc_device<Ty>(scratchpad_size, exec_queue);
-  sycl::event e = oneapi::mkl::lapack::getrfnp_batch(
-      exec_queue, n, n, a_strided_mem, lda, stride_a, batch_size, scratchpad,
-      scratchpad_size);
-  std::vector<sycl::event> events;
-  for (std::int64_t i = 0; i < batch_size; ++i)
-    events.push_back(detail::dpct_memcpy(exec_queue, host_a[i],
-                                         a_strided_mem + i * stride_a,
-                                         n * lda * sizeof(T), automatic, {e}));
-
-  std::vector<void *> ptrs{scratchpad, a_strided_mem};
-  dpct::async_dpct_free(ptrs, events, exec_queue);
-#endif
-
-  exec_queue.submit([&](sycl::handler &cgh) {
-    cgh.depends_on(events);
-    cgh.host_task([=] { std::free(host_a); });
-  });
-#endif
-}
-
-} // namespace detail
-
-inline oneapi::mkl::transpose get_transpose(int t) {
-  if (t == 0) {
-    return oneapi::mkl::transpose::nontrans;
-  } else if (t == 1) {
-    return oneapi::mkl::transpose::trans;
-  } else {
-    return oneapi::mkl::transpose::conjtrans;
-  }
-}
-
-/// Computes the LU factorizations of a batch of general matrices.
-/// \param [in] exec_queue The queue where the routine should be executed.
-/// \param [in] n The order of the matrices.
-/// \param [in, out] a Array of pointers to matrices. These matrices will be
-/// overwritten by lower triangulars with unit diagonal elements and upper
-/// triangulars.
-/// \param [in] lda The leading dimension of the matrices.
-/// \param [out] ipiv An array stores the pivot indices. If \p ipiv is nullptr,
-/// non-pivoting LU factorization is computed.
-/// \param [out] info An array stores the error information.
-/// \param [in] batch_size The size of the batch.
-template <typename T>
-inline void getrf_batch_wrapper(sycl::queue &exec_queue, int n, T *a[],
-                                int lda, int *ipiv, int *info, int batch_size) {
-  if (ipiv == nullptr) {
-    detail::getrfnp_batch_wrapper(exec_queue, n, a, lda, info, batch_size);
-    return;
-  }
-  using Ty = typename DataType<T>::T2;
-  // Set the info array value to 0
-  detail::dpct_memset<unsigned char>(exec_queue, info, 0, sizeof(int) * batch_size);
-#ifdef DPCT_USM_LEVEL_NONE
-  std::int64_t stride_a = n * lda;
-  std::int64_t stride_ipiv = n;
-  std::int64_t scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<Ty>(
-      exec_queue, n, n, lda, stride_a, stride_ipiv, batch_size);
-
-  T *a_buffer_ptr;
-  a_buffer_ptr = (T *)dpct_malloc(stride_a * batch_size * sizeof(T));
-
-  T **host_a = (T **)std::malloc(batch_size * sizeof(T *));
-  dpct_memcpy(host_a, a, batch_size * sizeof(T *));
-  for (std::int64_t i = 0; i < batch_size; ++i)
-    dpct_memcpy(a_buffer_ptr + i * stride_a, host_a[i], n * lda * sizeof(T));
-
-  {
-    sycl::buffer<std::int64_t, 1> ipiv_buf(
-        sycl::range<1>(batch_size * stride_ipiv));
-    sycl::buffer<Ty, 1> scratchpad{sycl::range<1>(scratchpad_size)};
-    auto a_buffer = get_buffer<Ty>(a_buffer_ptr);
-    oneapi::mkl::lapack::getrf_batch(exec_queue, n, n, a_buffer, lda, stride_a,
-                             ipiv_buf, stride_ipiv, batch_size, scratchpad,
-                             scratchpad_size);
-
-    auto to_buffer = get_buffer<int>(ipiv);
-    exec_queue.submit([&](sycl::handler &cgh) {
-      auto from_acc = ipiv_buf.get_access<sycl::access_mode::read>(cgh);
-      auto to_acc = to_buffer.get_access<sycl::access_mode::write>(cgh);
-      cgh.parallel_for<dpct_kernel_name<class getrf_device_int64_to_int, T>>(
-          sycl::range<2>(batch_size, n), [=](sycl::id<2> id) {
-            to_acc[id.get(0) * n + id.get(1)] =
-                static_cast<int>(from_acc[id.get(0) * stride_ipiv + id.get(1)]);
-          });
-    });
-  }
-
-  // Copy back to the original buffers
-  std::vector<sycl::event> events;
-  for (std::int64_t i = 0; i < batch_size; ++i)
-    events.push_back(detail::dpct_memcpy(exec_queue, host_a[i],
-                                         a_buffer_ptr + i * stride_a,
-                                         n * lda * sizeof(T), automatic));
-
-  std::vector<void *> ptrs{host_a};
-  std::thread mem_free_thread(
-      [=](std::vector<void *> pointers_array,
-          std::vector<sycl::event> events_array) {
-        sycl::event::wait(events_array);
-        for (auto p : pointers_array)
-          std::free(p);
-      },
-      ptrs, events);
-  mem_free_thread.detach();
-#else
-  std::int64_t m_int64 = n;
-  std::int64_t n_int64 = n;
-  std::int64_t lda_int64 = lda;
-  std::int64_t group_sizes = batch_size;
-  std::int64_t scratchpad_size = oneapi::mkl::lapack::getrf_batch_scratchpad_size<Ty>(
-      exec_queue, &m_int64, &n_int64, &lda_int64, 1, &group_sizes);
-
-  Ty *scratchpad = sycl::malloc_device<Ty>(scratchpad_size, exec_queue);
-  std::int64_t *ipiv_int64 =
-      sycl::malloc_device<std::int64_t>(batch_size * n, exec_queue);
-  std::int64_t **ipiv_int64_ptr =
-      sycl::malloc_shared<std::int64_t *>(batch_size, exec_queue);
-  T **a_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
-  exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *)).wait();
-  for (std::int64_t i = 0; i < batch_size; ++i)
-    ipiv_int64_ptr[i] = ipiv_int64 + n * i;
-
-  oneapi::mkl::lapack::getrf_batch(exec_queue, &m_int64, &n_int64, (Ty **)a_shared, &lda_int64,
-                           ipiv_int64_ptr, 1, &group_sizes, scratchpad,
-                           scratchpad_size);
-
-  sycl::event e = exec_queue.submit([&](sycl::handler &cgh) {
-    cgh.parallel_for<dpct_kernel_name<class getrf_device_int64_to_int, T>>(
-        sycl::range<1>(batch_size * n), [=](sycl::id<1> idx) {
-          ipiv[idx] = static_cast<int>(ipiv_int64[idx]);
-        });
-  });
-
-  std::vector<void *> ptrs{scratchpad, ipiv_int64, ipiv_int64_ptr, a_shared};
-  async_dpct_free(ptrs, {e}, exec_queue);
-#endif
-}
-
-/// Solves a system of linear equations with a batch of LU-factored square
-/// coefficient matrices, with multiple right-hand sides.
-/// \param [in] exec_queue The queue where the routine should be executed.
-/// \param [in] trans Indicates the form of the linear equations.
-/// \param [in] n The order of the matrices.
-/// \param [in] nrhs The number of right hand sides.
-/// \param [in] a Array of pointers to matrices.
-/// \param [in] lda The leading dimension of the matrices in \p a.
-/// \param [in] ipiv An array stores the pivots.
-/// \param [in, out] b Array of pointers to matrices, whose columns are
-/// the right-hand sides for the systems of equations.
-/// \param [in] ldb The leading dimension of the matrices in \p b.
-/// \param [out] info A value stores the error information.
-/// \param [in] batch_size The size of the batch.
-template <typename T>
-inline void getrs_batch_wrapper(sycl::queue &exec_queue,
-                                oneapi::mkl::transpose trans, int n, int nrhs,
-                                const T *a[], int lda, const int *ipiv, T *b[],
-                                int ldb, int *info, int batch_size) {
-  using Ty = typename DataType<T>::T2;
-  // Set the info value to 0
-  *info = 0;
-#ifdef DPCT_USM_LEVEL_NONE
-  std::int64_t stride_a = n * lda;
-  std::int64_t stride_b = nrhs * ldb;
-  std::int64_t stride_ipiv = n;
-  std::int64_t scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<Ty>(
-      exec_queue, trans, n, nrhs, lda, stride_a, stride_ipiv, ldb, stride_b,
-      batch_size);
-
-  T *a_buffer_ptr, *b_buffer_ptr;
-  a_buffer_ptr = (T *)dpct_malloc(stride_a * batch_size * sizeof(T));
-  b_buffer_ptr = (T *)dpct_malloc(stride_b * batch_size * sizeof(T));
-
-  T **host_a = (T **)std::malloc(batch_size * sizeof(T *));
-  T **host_b = (T **)std::malloc(batch_size * sizeof(T *));
-  dpct_memcpy(host_a, a, batch_size * sizeof(T *));
-  dpct_memcpy(host_b, b, batch_size * sizeof(T *));
-  for (std::int64_t i = 0; i < batch_size; ++i) {
-    dpct_memcpy(a_buffer_ptr + i * stride_a, host_a[i], n * lda * sizeof(T));
-    dpct_memcpy(b_buffer_ptr + i * stride_b, host_b[i], nrhs * ldb * sizeof(T));
-  }
-
-  {
-    auto a_buffer = get_buffer<Ty>(a_buffer_ptr);
-    auto b_buffer = get_buffer<Ty>(b_buffer_ptr);
-    sycl::buffer<Ty, 1> scratchpad{sycl::range<1>(scratchpad_size)};
-    sycl::buffer<std::int64_t, 1> ipiv_buf(
-        sycl::range<1>(batch_size * stride_ipiv));
-    auto from_buf = get_buffer<int>(ipiv);
-    exec_queue.submit([&](sycl::handler &cgh) {
-      auto from_acc = from_buf.get_access<sycl::access_mode::read>(cgh);
-      auto to_acc = ipiv_buf.get_access<sycl::access_mode::write>(cgh);
-      cgh.parallel_for<dpct_kernel_name<class getrs_device_int64_to_int, T>>(
-          sycl::range<2>(batch_size, n), [=](sycl::id<2> id) {
-            to_acc[id.get(0) * stride_ipiv + id.get(1)] =
-                static_cast<std::int64_t>(from_acc[id.get(0) * n + id.get(1)]);
-          });
-    });
-
-    oneapi::mkl::lapack::getrs_batch(exec_queue, trans, n, nrhs, a_buffer, lda,
-                             stride_a, ipiv_buf, stride_ipiv, b_buffer, ldb,
-                             stride_b, batch_size, scratchpad, scratchpad_size);
-  }
-
-  // Copy back to the original buffers
-  std::vector<sycl::event> events;
-  for (std::int64_t i = 0; i < batch_size; ++i)
-    events.push_back(detail::dpct_memcpy(exec_queue, host_b[i],
-                                         b_buffer_ptr + i * stride_b,
-                                         nrhs * ldb * sizeof(T), automatic));
-  std::vector<void *> ptrs{host_a, host_b};
-  std::thread mem_free_thread(
-      [=](std::vector<void *> pointers_array,
-          std::vector<sycl::event> events_array) {
-        sycl::event::wait(events_array);
-        for (auto p : pointers_array)
-          std::free(p);
-      },
-      ptrs, events);
-  mem_free_thread.detach();
-#else
-  std::int64_t n_int64 = n;
-  std::int64_t nrhs_int64 = nrhs;
-  std::int64_t lda_int64 = lda;
-  std::int64_t ldb_int64 = ldb;
-  std::int64_t group_sizes = batch_size;
-  std::int64_t scratchpad_size = oneapi::mkl::lapack::getrs_batch_scratchpad_size<Ty>(
-      exec_queue, &trans, &n_int64, &nrhs_int64, &lda_int64, &ldb_int64, 1,
-      &group_sizes);
-
-  Ty *scratchpad = sycl::malloc_device<Ty>(scratchpad_size, exec_queue);
-  std::int64_t *ipiv_int64 =
-      sycl::malloc_device<std::int64_t>(batch_size * n, exec_queue);
-  std::int64_t **ipiv_int64_ptr =
-      sycl::malloc_shared<std::int64_t *>(batch_size, exec_queue);
-  T **a_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
-  T **b_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
-  exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *));
-  exec_queue.memcpy(b_shared, b, batch_size * sizeof(T *));
-
-  exec_queue.submit([&](sycl::handler &cgh) {
-    cgh.parallel_for<dpct_kernel_name<class getrs_device_int64_to_int, T>>(
-        sycl::range<1>(batch_size * n), [=](sycl::id<1> idx) {
-          ipiv_int64[idx] = static_cast<std::int64_t>(ipiv[idx]);
-        });
-  }).wait();
-
-  for (std::int64_t i = 0; i < batch_size; ++i)
-    ipiv_int64_ptr[i] = ipiv_int64 + n * i;
-
-  sycl::event e = oneapi::mkl::lapack::getrs_batch(
-      exec_queue, &trans, &n_int64, &nrhs_int64, (Ty **)a_shared, &lda_int64,
-      ipiv_int64_ptr, (Ty **)b_shared, &ldb_int64, 1, &group_sizes, scratchpad,
-      scratchpad_size);
-
-  std::vector<void *> ptrs{scratchpad, ipiv_int64_ptr, ipiv_int64, a_shared, b_shared};
-  async_dpct_free(ptrs, {e}, exec_queue);
-#endif
-}
-
-/// Computes the inverses of a batch of LU-factored matrices.
-/// \param [in] exec_queue The queue where the routine should be executed.
-/// \param [in] n The order of the matrices.
-/// \param [in] a Array of pointers to matrices.
-/// \param [in] lda The leading dimension of the matrices in \p a.
-/// \param [in] ipiv An array stores the pivots.
-/// \param [out] b Array of pointers to inverse matrices.
-/// \param [in] ldb The leading dimension of the matrices in \p b.
-/// \param [out] info An array stores the error information.
-/// \param [in] batch_size The size of the batch.
-template <typename T>
-inline void getri_batch_wrapper(sycl::queue &exec_queue, int n,
-                                const T *a[], int lda, int *ipiv, T *b[],
-                                int ldb, int *info, int batch_size) {
-  using Ty = typename DataType<T>::T2;
-  // Set the info array value to 0
-  detail::dpct_memset<unsigned char>(exec_queue, info, 0, sizeof(int) * batch_size);
-#ifdef DPCT_USM_LEVEL_NONE
-  std::int64_t stride_b = n * ldb;
-  std::int64_t stride_ipiv = n;
-  std::int64_t scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<Ty>(
-      exec_queue, n, ldb, stride_b, stride_ipiv, batch_size);
-
-  T *b_buffer_ptr;
-  b_buffer_ptr = (T *)dpct_malloc(stride_b * batch_size * sizeof(T));
-
-  T **host_a = (T **)std::malloc(batch_size * sizeof(T *));
-  T **host_b = (T **)std::malloc(batch_size * sizeof(T *));
-  dpct_memcpy(host_a, a, batch_size * sizeof(T *));
-  dpct_memcpy(host_b, b, batch_size * sizeof(T *));
-
-  for (std::int64_t i = 0; i < batch_size; ++i) {
-    // Need to create a copy of input matrices "a" to keep them unchanged.
-    // Matrices "b" (copy of matrices "a") will be used as input and output
-    // parameter in oneapi::mkl::lapack::getri_batch call.
-    matrix_mem_copy(b_buffer_ptr + i * stride_b, host_a[i], ldb, lda, n, n,
-                    dpct::device_to_device, exec_queue);
-  }
-
-  {
-    auto b_buffer = get_buffer<Ty>(b_buffer_ptr);
-    sycl::buffer<Ty, 1> scratchpad{sycl::range<1>(scratchpad_size)};
-    sycl::buffer<std::int64_t, 1> ipiv_buf(
-        sycl::range<1>(batch_size * stride_ipiv));
-    auto from_buf = get_buffer<int>(ipiv);
-    exec_queue.submit([&](sycl::handler &cgh) {
-      auto from_acc = from_buf.get_access<sycl::access_mode::read>(cgh);
-      auto to_acc = ipiv_buf.get_access<sycl::access_mode::write>(cgh);
-      cgh.parallel_for<dpct_kernel_name<class getri_device_int64_to_int, T>>(
-          sycl::range<2>(batch_size, n), [=](sycl::id<2> id) {
-            to_acc[id.get(0) * stride_ipiv + id.get(1)] =
-                static_cast<std::int64_t>(from_acc[id.get(0) * n + id.get(1)]);
-          });
-    });
-
-    oneapi::mkl::lapack::getri_batch(exec_queue, n, b_buffer, ldb, stride_b, ipiv_buf,
-                             stride_ipiv, batch_size, scratchpad,
-                             scratchpad_size);
-  }
-
-  // Copy back to the original buffers
-  std::vector<sycl::event> events;
-  for (std::int64_t i = 0; i < batch_size; ++i)
-    events.push_back(detail::dpct_memcpy(exec_queue, host_b[i],
-                                         b_buffer_ptr + i * stride_b,
-                                         n * ldb * sizeof(T), automatic));
-  std::vector<void *> ptrs{host_a, host_b};
-  std::thread mem_free_thread(
-      [=](std::vector<void *> pointers_array,
-          std::vector<sycl::event> events_array) {
-        sycl::event::wait(events_array);
-        for (auto p : pointers_array)
-          std::free(p);
-      },
-      ptrs, events);
-  mem_free_thread.detach();
-#else
-  std::int64_t n_int64 = n;
-  std::int64_t ldb_int64 = ldb;
-  std::int64_t group_sizes = batch_size;
-  std::int64_t scratchpad_size = oneapi::mkl::lapack::getri_batch_scratchpad_size<Ty>(
-      exec_queue, &n_int64, &ldb_int64, 1, &group_sizes);
-
-  Ty *scratchpad = sycl::malloc_device<Ty>(scratchpad_size, exec_queue);
-  std::int64_t *ipiv_int64 =
-      sycl::malloc_device<std::int64_t>(batch_size * n, exec_queue);
-  std::int64_t **ipiv_int64_ptr =
-      sycl::malloc_shared<std::int64_t *>(batch_size, exec_queue);
-
-  exec_queue.submit([&](sycl::handler &cgh) {
-    cgh.parallel_for<dpct_kernel_name<class getri_device_int64_to_int, T>>(
-        sycl::range<1>(batch_size * n), [=](sycl::id<1> idx) {
-          ipiv_int64[idx] = static_cast<std::int64_t>(ipiv[idx]);
-        });
-  });
-
-  T **a_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
-  T **b_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
-  exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *));
-  exec_queue.memcpy(b_shared, b, batch_size * sizeof(T *)).wait();
-  for (std::int64_t i = 0; i < batch_size; ++i) {
-    ipiv_int64_ptr[i] = ipiv_int64 + n * i;
-    // Need to create a copy of input matrices "a" to keep them unchanged.
-    // Matrices "b" (copy of matrices "a") will be used as input and output
-    // parameter in oneapi::mkl::lapack::getri_batch call.
-    matrix_mem_copy(b_shared[i], a_shared[i], ldb, lda, n, n, dpct::device_to_device,
-                    exec_queue);
-  }
-
-  sycl::event e = oneapi::mkl::lapack::getri_batch(
-      exec_queue, &n_int64, (Ty **)b_shared, &ldb_int64, ipiv_int64_ptr, 1,
-      &group_sizes, scratchpad, scratchpad_size);
-
-  std::vector<void *> ptrs{scratchpad, ipiv_int64_ptr, ipiv_int64, a_shared, b_shared};
-  async_dpct_free(ptrs, {e}, exec_queue);
-#endif
-}
-
-/// Computes the QR factorizations of a batch of general matrices.
-/// \param [in] exec_queue The queue where the routine should be executed.
-/// \param [in] m The number of rows in the matrices.
-/// \param [in] n The number of columns in the matrices.
-/// \param [in, out] a Array of pointers to matrices. These
-/// matrices will be overwritten by the factorization data.
-/// \param [in] lda The leading dimension of the matrices in \p a.
-/// \param [out] tau An array stores the scalars.
-/// \param [out] info A value stores the error information.
-/// \param [in] batch_size The size of the batch.
-template <typename T>
-inline void geqrf_batch_wrapper(sycl::queue exec_queue, int m, int n,
-                                T *a[], int lda, T *tau[], int *info,
-                                int batch_size) {
-  using Ty = typename DataType<T>::T2;
-  // Set the info value to 0
-  *info = 0;
-#ifdef DPCT_USM_LEVEL_NONE
-  std::int64_t stride_a = n * lda;
-  std::int64_t stride_tau = std::max(1, std::min(m, n));
-  std::int64_t scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<Ty>(
-      exec_queue, m, n, lda, stride_a, stride_tau, batch_size);
-
-  T *a_buffer_ptr, *tau_buffer_ptr;
-  a_buffer_ptr = (T *)dpct_malloc(stride_a * batch_size * sizeof(T));
-  tau_buffer_ptr = (T *)dpct_malloc(stride_tau * batch_size * sizeof(T));
-
-  T **host_a = (T **)std::malloc(batch_size * sizeof(T *));
-  T **host_tau = (T **)std::malloc(batch_size * sizeof(T *));
-  dpct_memcpy(host_a, a, batch_size * sizeof(T *));
-  dpct_memcpy(host_tau, tau, batch_size * sizeof(T *));
-
-  for (std::int64_t i = 0; i < batch_size; ++i)
-    dpct_memcpy(a_buffer_ptr + i * stride_a, host_a[i], n * lda * sizeof(T));
-  {
-    auto a_buffer = get_buffer<Ty>(a_buffer_ptr);
-    auto tau_buffer = get_buffer<Ty>(tau_buffer_ptr);
-    sycl::buffer<Ty, 1> scratchpad{sycl::range<1>(scratchpad_size)};
-    oneapi::mkl::lapack::geqrf_batch(exec_queue, m, n, a_buffer, lda, stride_a,
-                             tau_buffer, stride_tau, batch_size, scratchpad,
-                             scratchpad_size);
-  }
-
-  // Copy back to the original buffers
-  std::vector<sycl::event> events_a;
-  std::vector<sycl::event> events_tau;
-  for (std::int64_t i = 0; i < batch_size; ++i) {
-    events_a.push_back(detail::dpct_memcpy(exec_queue, host_a[i],
-                                           a_buffer_ptr + i * stride_a,
-                                           n * lda * sizeof(T), automatic));
-    events_tau.push_back(detail::dpct_memcpy(
-        exec_queue, host_tau[i], tau_buffer_ptr + i * stride_tau,
-        std::max(1, std::min(m, n)) * sizeof(T), automatic));
-  }
-  std::vector<void *> ptr_a{host_a};
-  std::vector<void *> ptr_tau{host_tau};
-  std::thread mem_free_thread_a(
-      [=](std::vector<void *> pointers_array,
-          std::vector<sycl::event> events_array) {
-        sycl::event::wait(events_array);
-        for (auto p : pointers_array)
-          std::free(p);
-      },
-      ptr_a, events_a);
-  std::thread mem_free_thread_tau(
-      [=](std::vector<void *> pointers_array,
-          std::vector<sycl::event> events_array) {
-        sycl::event::wait(events_array);
-        for (auto p : pointers_array)
-          std::free(p);
-      },
-      ptr_tau, events_tau);
-  mem_free_thread_a.detach();
-  mem_free_thread_tau.detach();
-#else
-  std::int64_t m_int64 = n;
-  std::int64_t n_int64 = n;
-  std::int64_t lda_int64 = lda;
-  std::int64_t group_sizes = batch_size;
-  std::int64_t scratchpad_size = oneapi::mkl::lapack::geqrf_batch_scratchpad_size<Ty>(
-      exec_queue, &m_int64, &n_int64, &lda_int64, 1, &group_sizes);
-
-  Ty *scratchpad = sycl::malloc_device<Ty>(scratchpad_size, exec_queue);
-  T **a_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
-  T **tau_shared = sycl::malloc_shared<T *>(batch_size, exec_queue);
-  exec_queue.memcpy(a_shared, a, batch_size * sizeof(T *));
-  exec_queue.memcpy(tau_shared, tau, batch_size * sizeof(T *)).wait();
-
-  sycl::event e = oneapi::mkl::lapack::geqrf_batch(
-      exec_queue, &m_int64, &n_int64, (Ty **)a_shared, &lda_int64, (Ty **)tau_shared, 1,
-      &group_sizes, scratchpad, scratchpad_size);
-
-  std::vector<void *> ptrs{scratchpad, a_shared, tau_shared};
-  async_dpct_free(ptrs, {e}, exec_queue);
-#endif
-}
-
-/// Computes the Euclidean norm of a vector.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] n Number of elements in vector x.
-/// \param [in] x Input vector x.
-/// \param [in] x_type Data type of the vector x.
-/// \param [in] incx Stride of vector x.
-/// \param [out] result The result scalar.
-/// \param [in] result_type Data type of the result.
-inline void nrm2(sycl::queue &q, int n, const void *x, library_data_t x_type,
-                    int incx, void *result, library_data_t result_type) {
-  std::uint64_t key = detail::get_type_combination_id(x_type, result_type);
-  switch (key) {
-  case detail::get_type_combination_id(library_data_t::real_float,
-                       library_data_t::real_float): {
-    detail::nrm2_impl<float, float>(q, n, x, incx, result);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_double,
-                       library_data_t::real_double): {
-    detail::nrm2_impl<double, double>(q, n, x, incx, result);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_float,
-                       library_data_t::real_float): {
-    detail::nrm2_impl<std::complex<float>, float>(
-        q, n, x, incx, result);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_double,
-                       library_data_t::real_double): {
-    detail::nrm2_impl<std::complex<double>, double>(
-        q, n, x, incx, result);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_half,
-                       library_data_t::real_half): {
-    detail::nrm2_impl<sycl::half, sycl::half>(
-        q, n, x, incx, result);
-    break;
-  }
-  default:
-    throw std::runtime_error("the combination of data type is unsupported");
-  }
-}
-
-/// Computes the dot product of two vectors.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] n Number of elements in vector x.
-/// \param [in] x Input vector x.
-/// \param [in] x_type Data type of the vector x.
-/// \param [in] incx Stride of vector x.
-/// \param [in] y Input vector y.
-/// \param [in] y_type Data type of the vector y.
-/// \param [in] incy Stride of vector y.
-/// \param [out] result The result scalar.
-/// \param [in] result_type Data type of the result.
-inline void dot(sycl::queue &q, int n, const void *x, library_data_t x_type,
-                   int incx, const void *y, library_data_t y_type, int incy,
-                   void *result, library_data_t result_type) {
-  detail::dotuc<false>(q, n, x, x_type, incx, y, y_type, incy, result,
-                          result_type);
-}
-
-/// Computes the dot product of two vectors, conjugating the first vector.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] n Number of elements in vector x.
-/// \param [in] x Input vector x.
-/// \param [in] x_type Data type of the vector x.
-/// \param [in] incx Stride of vector x.
-/// \param [in] y Input vector y.
-/// \param [in] y_type Data type of the vector y.
-/// \param [in] incy Stride of vector y.
-/// \param [out] result The result scalar.
-/// \param [in] result_type Data type of the result.
-inline void dotc(sycl::queue &q, int n, const void *x, library_data_t x_type,
-                    int incx, const void *y, library_data_t y_type, int incy,
-                    void *result, library_data_t result_type) {
-  detail::dotuc<true>(q, n, x, x_type, incx, y, y_type, incy, result,
-                         result_type);
-}
-
-/// Computes the product of a vector by a scalar.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] n Number of elements in vector x.
-/// \param [in] alpha The scale factor alpha.
-/// \param [in] alpha_type The data type of alpha.
-/// \param [in, out] x Input/Output vector x.
-/// \param [in] x_type Data type of the vector x.
-/// \param [in] incx Stride of vector x.
-inline void scal(sycl::queue &q, int n, const void *alpha,
-                    library_data_t alpha_type, void *x, library_data_t x_type,
-                    int incx) {
-  std::uint64_t key = detail::get_type_combination_id(x_type);
-  switch (key) {
-  case detail::get_type_combination_id(library_data_t::real_float): {
-    detail::scal_impl<float, float>(q, n, alpha, x, incx);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_double): {
-    detail::scal_impl<double, double>(q, n, alpha, x, incx);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_float): {
-    detail::scal_impl<std::complex<float>, std::complex<float>>(q, n, alpha,
-                                                                   x, incx);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_double): {
-    detail::scal_impl<std::complex<double>, std::complex<double>>(
-        q, n, alpha, x, incx);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_half): {
-    float alpha_value =
-        dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-    sycl::half alaph_half(alpha_value);
-    detail::scal_impl<sycl::half, sycl::half>(q, n, &alaph_half, x, incx);
-    break;
-  }
-  default:
-    throw std::runtime_error("the combination of data type is unsupported");
-  }
-}
-
-/// Computes a vector-scalar product and adds the result to a vector.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] n Number of elements in vector x.
-/// \param [in] alpha The scale factor alpha.
-/// \param [in] alpha_type The data type of alpha.
-/// \param [in] x Input vector x.
-/// \param [in] x_type Data type of the vector x.
-/// \param [in] incx Stride of vector x.
-/// \param [in, out] y Input/Output vector y.
-/// \param [in] y_type Data type of the vector y.
-/// \param [in] incy Stride of vector y.
-inline void axpy(sycl::queue &q, int n, const void *alpha,
-                    library_data_t alpha_type, const void *x, library_data_t x_type,
-                    int incx, void *y, library_data_t y_type, int incy) {
-  std::uint64_t key = detail::get_type_combination_id(x_type, alpha_type);
-  switch (key) {
-  case detail::get_type_combination_id(library_data_t::real_float,
-                       library_data_t::real_float): {
-    detail::axpy_impl<float, float>(q, n, alpha, x, incx, y, incy);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_double,
-                       library_data_t::real_double): {
-    detail::axpy_impl<double, double>(q, n, alpha, x, incx, y, incy);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_float,
-                       library_data_t::complex_float): {
-    detail::axpy_impl<std::complex<float>, std::complex<float>>(
-        q, n, alpha, x, incx, y, incy);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_double,
-                       library_data_t::complex_double): {
-    detail::axpy_impl<std::complex<double>, std::complex<double>>(
-        q, n, alpha, x, incx, y, incy);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_half,
-                       library_data_t::real_float): {
-    float alpha_value =
-        dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-    sycl::half alaph_half(alpha_value);
-    detail::axpy_impl<sycl::half, sycl::half>(q, n, &alaph_half, x, incx, y, incy);
-    break;
-  }
-  default:
-    throw std::runtime_error("the combination of data type is unsupported");
-  }
-}
-
-/// Performs rotation of points in the plane.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] n Number of elements in vector x.
-/// \param [in, out] x Input/Output vector x.
-/// \param [in] x_type Data type of the vector x.
-/// \param [in] incx Stride of vector x.
-/// \param [in, out] y Input/Output vector y.
-/// \param [in] y_type Data type of the vector y.
-/// \param [in] incy Stride of vector y.
-/// \param [in] c Scaling factor.
-/// \param [in] s Scaling factor.
-/// \param [in] cs_type Data type of the scaling factors.
-inline void rot(sycl::queue &q, int n, void *x, library_data_t x_type,
-                   int incx, void *y, library_data_t y_type, int incy,
-                   const void *c, const void *s, library_data_t cs_type) {
-  std::uint64_t key = detail::get_type_combination_id(x_type, cs_type);
-  switch (key) {
-  case detail::get_type_combination_id(library_data_t::real_float,
-                       library_data_t::real_float): {
-    detail::rot_impl<float, float, float>(q, n, x, incx, y, incy, c, s);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_double,
-                       library_data_t::real_double): {
-    detail::rot_impl<double, double, double>(q, n, x, incx, y, incy, c, s);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_float,
-                       library_data_t::real_float): {
-    detail::rot_impl<std::complex<float>, float, float>(q, n, x, incx, y, incy, c,
-                                                    s);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_double,
-                       library_data_t::real_double): {
-    detail::rot_impl<std::complex<double>, double, double>(q, n, x, incx, y, incy, c,
-                                                      s);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_float,
-                       library_data_t::complex_float): {
-    detail::rot_impl<std::complex<float>, float, std::complex<float>>(q, n, x, incx, y, incy, c, s);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_double,
-                       library_data_t::complex_double): {
-    detail::rot_impl<std::complex<double>, double, std::complex<double>>(q, n, x, incx, y, incy, c, s);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_half,
-                       library_data_t::real_half): {
-    detail::rot_impl<sycl::half, sycl::half, sycl::half>(q, n, x, incx, y, incy, c, s);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_bfloat16,
-                       library_data_t::real_bfloat16): {
-    detail::rot_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, oneapi::mkl::bfloat16>(q, n, x, incx, y, incy, c, s);
-    break;
-  }
-  default:
-    throw std::runtime_error("the combination of data type is unsupported");
-  }
-}
-
-/// Computes matrix-matrix product with general matrices.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] a_trans Specifies the operation applied to A.
-/// \param [in] b_trans Specifies the operation applied to B.
-/// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
-/// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
-/// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
-/// \param [in] alpha Scaling factor for the matrix-matrix product.
-/// \param [in] a Input matrix A.
-/// \param [in] a_type Data type of the matrix A.
-/// \param [in] lda Leading dimension of A.
-/// \param [in] b Input matrix B.
-/// \param [in] b_type Data type of the matrix B.
-/// \param [in] ldb Leading dimension of B.
-/// \param [in] beta Scaling factor for matrix C.
-/// \param [in, out] c Input/Output matrix C.
-/// \param [in] c_type Data type of the matrix C.
-/// \param [in] ldc Leading dimension of C.
-/// \param [in] scaling_type Data type of the scaling factors.
-inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                 oneapi::mkl::transpose b_trans, int m, int n, int k,
-                 const void *alpha, const void *a, library_data_t a_type,
-                 int lda, const void *b, library_data_t b_type, int ldb,
-                 const void *beta, void *c, library_data_t c_type, int ldc,
-                 library_data_t scaling_type) {
-  bool matched = false;
-  if (scaling_type == library_data_t::real_float &&
-      c_type == library_data_t::complex_float) {
-    scaling_type = library_data_t::complex_float;
-  } else if (scaling_type == library_data_t::real_double &&
-             c_type == library_data_t::complex_double) {
-    scaling_type = library_data_t::complex_double;
-  }
-
-  std::uint64_t key =
-      detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-  switch (key) {
-  case detail::get_type_combination_id(
-      library_data_t::real_float, library_data_t::real_float,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_impl<float, float, float, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_double, library_data_t::real_double,
-      library_data_t::real_double, library_data_t::real_double): {
-    detail::gemm_impl<double, double, double, double>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::complex_float, library_data_t::complex_float,
-      library_data_t::complex_float, library_data_t::complex_float): {
-    detail::gemm_impl<std::complex<float>, std::complex<float>,
-                      std::complex<float>, std::complex<float>>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::complex_double, library_data_t::complex_double,
-      library_data_t::complex_double, library_data_t::complex_double): {
-    detail::gemm_impl<std::complex<double>, std::complex<double>,
-                      std::complex<double>, std::complex<double>>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_half, library_data_t::real_half,
-      library_data_t::real_half, library_data_t::real_half): {
-    detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                      sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
-                                      lda, b, ldb, beta, c, ldc);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                      float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
-                             ldb, beta, c, ldc);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_half, library_data_t::real_half,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_impl<sycl::half, sycl::half, float, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_half, library_data_t::real_half,
-      library_data_t::real_half, library_data_t::real_float): {
-    float alpha_value =
-        dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-    float beta_value =
-        dpct::get_value(reinterpret_cast<const float *>(beta), q);
-    sycl::half alpha_half(alpha_value);
-    sycl::half beta_half(beta_value);
-    detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                      sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
-                                      a, lda, b, ldb, &beta_half, c, ldc);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_int8, library_data_t::real_int8,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-      library_data_t::real_bfloat16, library_data_t::real_float): {
-    detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                      oneapi::mkl::bfloat16, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_int8, library_data_t::real_int8,
-      library_data_t::real_int32, library_data_t::real_int32): {
-    float alpha_float =
-        dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-    float beta_float =
-        dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-    detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
-        q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
-    break;
-  }
-  default:
-    throw std::runtime_error("the combination of data type is unsupported");
-  }
-}
-
-/// Computes a batch of matrix-matrix product with general matrices.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] a_trans Specifies the operation applied to A.
-/// \param [in] b_trans Specifies the operation applied to B.
-/// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
-/// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
-/// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
-/// \param [in] alpha Scaling factor for the matrix-matrix product.
-/// \param [in] a Input matrix A.
-/// \param [in] a_type Data type of the matrix A.
-/// \param [in] lda Leading dimension of A.
-/// \param [in] b Input matrix B.
-/// \param [in] b_type Data type of the matrix B.
-/// \param [in] ldb Leading dimension of B.
-/// \param [in] beta Scaling factor for matrix C.
-/// \param [in, out] c Input/Output matrix C.
-/// \param [in] c_type Data type of the matrix C.
-/// \param [in] ldc Leading dimension of C.
-/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
-/// \param [in] scaling_type Data type of the scaling factors.
-inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                       oneapi::mkl::transpose b_trans, int m, int n, int k,
-                       const void *alpha, const void *a[],
-                       library_data_t a_type, int lda, const void *b[],
-                       library_data_t b_type, int ldb, const void *beta,
-                       void *c[], library_data_t c_type, int ldc,
-                       int batch_size, library_data_t scaling_type) {
-#ifdef DPCT_USM_LEVEL_NONE
-  throw std::runtime_error("this API is unsupported when USM level is none");
-#else
-  bool matched = false;
-  if (scaling_type == library_data_t::real_float &&
-      c_type == library_data_t::complex_float) {
-    scaling_type = library_data_t::complex_float;
-  } else if (scaling_type == library_data_t::real_double &&
-             c_type == library_data_t::complex_double) {
-    scaling_type = library_data_t::complex_double;
-  }
-
-  std::uint64_t key =
-      detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-  switch (key) {
-  case detail::get_type_combination_id(
-      library_data_t::real_float, library_data_t::real_float,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_batch_impl<float, float, float, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_double, library_data_t::real_double,
-      library_data_t::real_double, library_data_t::real_double): {
-    detail::gemm_batch_impl<double, double, double, double>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::complex_float, library_data_t::complex_float,
-      library_data_t::complex_float, library_data_t::complex_float): {
-    detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                            std::complex<float>, std::complex<float>>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::complex_double, library_data_t::complex_double,
-      library_data_t::complex_double, library_data_t::complex_double): {
-    detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                            std::complex<double>, std::complex<double>>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_half, library_data_t::real_half,
-      library_data_t::real_half, library_data_t::real_half): {
-    detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                            sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                            a, lda, b, ldb, beta, c, ldc,
-                                            batch_size);
-    break;
-  }
-#ifdef __INTEL_MKL__
-  case detail::get_type_combination_id(
-      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-      library_data_t::real_bfloat16, library_data_t::real_float): {
-    detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                            oneapi::mkl::bfloat16, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                            float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                   b, ldb, beta, c, ldc, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_int8, library_data_t::real_int8,
-      library_data_t::real_int32, library_data_t::real_int32): {
-    float alpha_float =
-        dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-    float beta_float =
-        dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-    detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                            float>(q, a_trans, b_trans, m, n, k, &alpha_float,
-                                          a, lda, b, ldb, &beta_float, c, ldc,
-                                          batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_int8, library_data_t::real_int8,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_half, library_data_t::real_half,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-        batch_size);
-    break;
-  }
-#endif
-  case detail::get_type_combination_id(
-      library_data_t::real_half, library_data_t::real_half,
-      library_data_t::real_half, library_data_t::real_float): {
-    float alpha_value =
-        dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-    float beta_value =
-        dpct::get_value(reinterpret_cast<const float *>(beta), q);
-    sycl::half alpha_half(alpha_value);
-    sycl::half beta_half(beta_value);
-    detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-        q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
-        batch_size);
-    break;
-  }
-  default:
-    throw std::runtime_error("the combination of data type is unsupported");
-  }
-#endif
-}
-
-/// Computes a batch of matrix-matrix product with general matrices.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] a_trans Specifies the operation applied to A.
-/// \param [in] b_trans Specifies the operation applied to B.
-/// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
-/// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
-/// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
-/// \param [in] alpha Scaling factor for the matrix-matrix product.
-/// \param [in] a Input matrix A.
-/// \param [in] a_type Data type of the matrix A.
-/// \param [in] lda Leading dimension of A.
-/// \param [in] stride_a Stride between the different A matrices.
-/// \param [in] b Input matrix B.
-/// \param [in] b_type Data type of the matrix B.
-/// \param [in] ldb Leading dimension of B.
-/// \param [in] stride_b Stride between the different B matrices.
-/// \param [in] beta Scaling factor for matrix C.
-/// \param [in, out] c Input/Output matrix C.
-/// \param [in] c_type Data type of the matrix C.
-/// \param [in] ldc Leading dimension of C.
-/// \param [in] stride_c Stride between the different C matrices.
-/// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
-/// \param [in] scaling_type Data type of the scaling factors.
-inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                       oneapi::mkl::transpose b_trans, int m, int n, int k,
-                       const void *alpha, const void *a, library_data_t a_type,
-                       int lda, long long int stride_a, const void *b,
-                       library_data_t b_type, int ldb, long long int stride_b,
-                       const void *beta, void *c, library_data_t c_type,
-                       int ldc, long long int stride_c, int batch_size,
-                       library_data_t scaling_type) {
-  bool matched = false;
-  if (scaling_type == library_data_t::real_float &&
-      c_type == library_data_t::complex_float) {
-    scaling_type = library_data_t::complex_float;
-  } else if (scaling_type == library_data_t::real_double &&
-             c_type == library_data_t::complex_double) {
-    scaling_type = library_data_t::complex_double;
-  }
-
-  std::uint64_t key =
-      detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-  switch (key) {
-  case detail::get_type_combination_id(
-      library_data_t::real_float, library_data_t::real_float,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_batch_impl<float, float, float, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_double, library_data_t::real_double,
-      library_data_t::real_double, library_data_t::real_double): {
-    detail::gemm_batch_impl<double, double, double, double>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::complex_float, library_data_t::complex_float,
-      library_data_t::complex_float, library_data_t::complex_float): {
-    detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                            std::complex<float>, std::complex<float>>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::complex_double, library_data_t::complex_double,
-      library_data_t::complex_double, library_data_t::complex_double): {
-    detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                            std::complex<double>, std::complex<double>>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_half, library_data_t::real_half,
-      library_data_t::real_half, library_data_t::real_half): {
-    detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                            sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                            a, lda, stride_a, b, ldb, stride_b,
-                                            beta, c, ldc, stride_c, batch_size);
-    break;
-  }
-#ifdef __INTEL_MKL__
-  case detail::get_type_combination_id(
-      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-      library_data_t::real_bfloat16, library_data_t::real_float): {
-    detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                            oneapi::mkl::bfloat16, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                            float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                   stride_a, b, ldb, stride_b, beta, c, ldc,
-                                   stride_c, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_int8, library_data_t::real_int8,
-      library_data_t::real_int32, library_data_t::real_int32): {
-    detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                            std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
-                                          a, lda, stride_a, b, ldb, stride_b,
-                                          beta, c, ldc, stride_c, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_int8, library_data_t::real_int8,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(
-      library_data_t::real_half, library_data_t::real_half,
-      library_data_t::real_float, library_data_t::real_float): {
-    detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-        q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-        beta, c, ldc, stride_c, batch_size);
-    break;
-  }
-#endif
-  case detail::get_type_combination_id(
-      library_data_t::real_half, library_data_t::real_half,
-      library_data_t::real_half, library_data_t::real_float): {
-    float alpha_value =
-        dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-    float beta_value =
-        dpct::get_value(reinterpret_cast<const float *>(beta), q);
-    sycl::half alpha_half(alpha_value);
-    sycl::half beta_half(beta_value);
-    detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-        q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
-        &beta_half, c, ldc, stride_c, batch_size);
-    break;
-  }
-  default:
-    throw std::runtime_error("the combination of data type is unsupported");
-  }
-}
-
-/// This routines perform a special rank-k update of a symmetric matrix C by
-/// general matrices A and B.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] uplo Specifies whether C's data is stored in its upper or lower triangle.
-/// \param [in] trans Specifies the operation to apply.
-/// \param [in] n The number of rows and columns in C.
-/// \param [in] k The inner dimension of matrix multiplications.
-/// \param [in] alpha Scaling factor for the rank-k update.
-/// \param [in] a Input matrix A.
-/// \param [in] lda Leading dimension of A.
-/// \param [in] b Input matrix B.
-/// \param [in] ldb Leading dimension of B.
-/// \param [in] beta Scaling factor for the rank-k update.
-/// \param [in, out] c Input/Output matrix C.
-/// \param [in] ldc Leading dimension of C.
-template <class T>
-inline void syrk(sycl::queue &q, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::transpose trans, int n, int k, const T *alpha,
-                  const T *a, int lda, const T *b, int ldb, const T *beta, T *c,
-                  int ldc) {
-  detail::rk_impl<false, T, T>(q, uplo, trans, n, k, alpha, a, lda, b,
-                                     ldb, beta, c, ldc);
-}
-
-/// This routines perform a special rank-k update of a Hermitian matrix C by
-/// general matrices A and B.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] uplo Specifies whether C's data is stored in its upper or lower triangle.
-/// \param [in] trans Specifies the operation to apply.
-/// \param [in] n The number of rows and columns in C.
-/// \param [in] k The inner dimension of matrix multiplications.
-/// \param [in] alpha Scaling factor for the rank-k update.
-/// \param [in] a Input matrix A.
-/// \param [in] lda Leading dimension of A.
-/// \param [in] b Input matrix B.
-/// \param [in] ldb Leading dimension of B.
-/// \param [in] beta Scaling factor for the rank-k update.
-/// \param [in, out] c Input/Output matrix C.
-/// \param [in] ldc Leading dimension of C.
-template <class T, class Tbeta>
-inline void herk(sycl::queue &q, oneapi::mkl::uplo uplo,
-                 oneapi::mkl::transpose trans, int n, int k, const T *alpha,
-                 const T *a, int lda, const T *b, int ldb, const Tbeta *beta,
-                 T *c, int ldc) {
-  detail::rk_impl<true, T, Tbeta>(q, uplo, trans, n, k, alpha, a, lda, b,
-                                        ldb, beta, c, ldc);
-}
-
-/// This routine performs a group of trsm operations. Each trsm solves an
-/// equation of the form op(A) * X = alpha * B or X * op(A) = alpha * B.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] left_right Specifies A multiplies X on the left or on the right.
-/// \param [in] upper_lower Specifies A is upper or lower triangular.
-/// \param [in] trans Specifies the operation applied to A.
-/// \param [in] unit_diag Specifies whether A is unit triangular.
-/// \param [in] m Number of rows of the B matrices.
-/// \param [in] n Number of columns of the B matrices.
-/// \param [in] alpha Scaling factor for the solutions.
-/// \param [in] a Input matrices A.
-/// \param [in] a_type Data type of the matrices A.
-/// \param [in] lda Leading dimension of the matrices A.
-/// \param [in, out] b Input and output matrices B.
-/// \param [in] b_type Data type of the matrices B.
-/// \param [in] ldb Leading dimension of the matrices B.
-/// \param [in] batch_size Specifies the number of trsm operations to perform.
-/// \param [in] scaling_type Data type of the scaling factors.
-inline void trsm_batch(sycl::queue &q, oneapi::mkl::side left_right,
-                       oneapi::mkl::uplo upper_lower,
-                       oneapi::mkl::transpose trans,
-                       oneapi::mkl::diag unit_diag, int m, int n,
-                       const void *alpha, const void **a, library_data_t a_type,
-                       int lda, void **b, library_data_t b_type, int ldb,
-                       int batch_size, library_data_t scaling_type) {
-#ifdef DPCT_USM_LEVEL_NONE
-  throw std::runtime_error("this API is unsupported when USM level is none");
-#else
-  std::uint64_t key =
-      detail::get_type_combination_id(a_type, b_type, scaling_type);
-  switch (key) {
-  case detail::get_type_combination_id(library_data_t::real_float,
-                                       library_data_t::real_float,
-                                       library_data_t::real_float): {
-    detail::trsm_batch_impl<float, float, float>(q, left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha,
-                                                 a, lda, b, ldb, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::real_double,
-                                       library_data_t::real_double,
-                                       library_data_t::real_double): {
-    detail::trsm_batch_impl<double, double, double>(
-        q, left_right, upper_lower, trans, unit_diag, m, n, alpha, a, lda, b,
-        ldb, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_float,
-                                       library_data_t::complex_float,
-                                       library_data_t::complex_float): {
-    detail::trsm_batch_impl<std::complex<float>, std::complex<float>,
-                            std::complex<float>>(q, left_right, upper_lower,
-                                                 trans, unit_diag, m, n, alpha,
-                                                 a, lda, b, ldb, batch_size);
-    break;
-  }
-  case detail::get_type_combination_id(library_data_t::complex_double,
-                                       library_data_t::complex_double,
-                                       library_data_t::complex_double): {
-    detail::trsm_batch_impl<std::complex<double>, std::complex<double>,
-                            std::complex<double>>(q, left_right, upper_lower,
-                                                  trans, unit_diag, m, n, alpha,
-                                                  a, lda, b, ldb, batch_size);
-    break;
-  }
-  default:
-    throw std::runtime_error("the combination of data type is unsupported");
-  }
-#endif
-}
-
-/// Computes a triangular matrix-general matrix product.
-/// \param [in] q The queue where the routine should be executed.
-/// \param [in] left_right Specifies A is on the left or right side of the
-/// multiplication.
-/// \param [in] upper_lower Specifies A is upper or lower triangular.
-/// \param [in] trans Specifies the operation applied to A.
-/// \param [in] unit_diag Specifies whether A is unit triangular.
-/// \param [in] m Number of rows of B.
-/// \param [in] n Number of columns of B.
-/// \param [in] alpha Scaling factor for the matrix-matrix product.
-/// \param [in] a Input matrices A.
-/// \param [in] lda Leading dimension of the matrices A.
-/// \param [in] b Input matrices B.
-/// \param [in] ldb Leading dimension of the matrices B.
-/// \param [out] c Output matrices C.
-/// \param [in] ldc Leading dimension of the matrices C.
-template <class T>
-inline void trmm(sycl::queue &q, oneapi::mkl::side left_right,
-                 oneapi::mkl::uplo upper_lower, oneapi::mkl::transpose trans,
-                 oneapi::mkl::diag unit_diag, int m, int n, const T *alpha,
-                 const T *a, int lda, const T *b, int ldb, T *c, int ldc) {
-  using Ty = typename DataType<T>::T2;
-  auto alpha_val = dpct::get_value(alpha, q);
-  if (b != c) {
-    dpct::matrix_mem_copy(c, b, ldc, ldb, m, n, dpct::device_to_device, q);
-  }
-  auto data_a = detail::get_memory<const Ty>(a);
-  auto data_c = detail::get_memory<Ty>(c);
-  oneapi::mkl::blas::column_major::trmm(q, left_right, upper_lower, trans,
-                                        unit_diag, m, n, alpha_val, data_a, lda,
-                                        data_c, ldc);
-}
-
-} // namespace dpct
-#endif // __DPCT_BLAS_UTILS_HPP__
diff --git a/dpct/ccl_utils.hpp b/dpct/ccl_utils.hpp
deleted file mode 100644
index 07b3488c937da..0000000000000
--- a/dpct/ccl_utils.hpp
+++ /dev/null
@@ -1,286 +0,0 @@
-//==---- ccl_utils.hpp----------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_CCL_UTILS_HPP__
-#define __DPCT_CCL_UTILS_HPP__
-
-#include <sycl/sycl.hpp>
-#include <oneapi/ccl.hpp>
-#include <unordered_map>
-#include <memory>
-
-#include "device.hpp"
-
-namespace dpct {
-namespace ccl {
-namespace detail {
-
-/// Get stored kvs with specified kvs address.
-inline std::shared_ptr<oneapi::ccl::kvs> &
-get_kvs(const oneapi::ccl::kvs::address_type &addr) {
-  struct hash {
-    std::size_t operator()(const oneapi::ccl::kvs::address_type &in) const {
-      return std::hash<std::string_view>()(std::string_view(in.data(), in.size()));
-    }
-  };
-  static std::unordered_map<oneapi::ccl::kvs::address_type,
-                            std::shared_ptr<oneapi::ccl::kvs>, hash>
-      kvs_map;
-  return kvs_map[addr];
-}
-
-/// Help class to init ccl environment. 
-class ccl_init_helper {
-public:
-  ccl_init_helper() { oneapi::ccl::init(); }
-};
-
-} // namespace detail
-
-/// Get concatenated library version as an integer.
-static inline int get_version() {
-  oneapi::ccl::init();
-  auto ver = oneapi::ccl::get_library_version();
-  return ver.major * 10000 + ver.minor * 100 + ver.update;
-}
-
-/// Create main kvs and return its address.
-static inline oneapi::ccl::kvs::address_type create_kvs_address() {
-  oneapi::ccl::init();
-  auto ptr = oneapi::ccl::create_main_kvs();
-  auto addr = ptr->get_address();
-  detail::get_kvs(addr) = ptr;
-  return addr;
-}
-
-/// Get stored kvs with /p addr if exist. Otherwise, create kvs with /p addr.
-static inline std::shared_ptr<oneapi::ccl::kvs>
-create_kvs(const oneapi::ccl::kvs::address_type &addr) {
-  oneapi::ccl::init();
-  auto &ptr = detail::get_kvs(addr);
-  if (!ptr)
-    ptr = oneapi::ccl::create_kvs(addr);
-  return ptr;
-}
-
-/// dpct communicator extension
-class communicator_wrapper : public dpct::ccl::detail::ccl_init_helper {
-public:
-  communicator_wrapper(
-      int size, int rank, oneapi::ccl::kvs::address_type id,
-      const oneapi::ccl::comm_attr &attr = oneapi::ccl::default_comm_attr)
-      : _device_comm(oneapi::ccl::create_device(
-            static_cast<sycl::device &>(dpct::get_current_device()))),
-        _context_comm(oneapi::ccl::create_context(dpct::get_default_context())),
-        _comm(oneapi::ccl::create_communicator(
-            size, rank, _device_comm, _context_comm, dpct::ccl::create_kvs(id),
-            attr)) {
-    _queue_init = false;
-    _ccl_stream_ptr = nullptr;
-  }
-
-  ~communicator_wrapper() {
-    delete _ccl_stream_ptr;
-  };
-
-  /// Return the rank in a oneapi::ccl::communicator
-  /// \returns The rank corresponding to communicator object
-  int rank() const {
-    return _comm.rank();
-  }
-
-  /// Retrieves the number of rank in oneapi::ccl::communicator
-  /// \returns The number of the ranks
-  int size() const {
-    return _comm.size();
-  }
-
-  /// Return underlying native device, which was used in oneapi::ccl::communicator
-  sycl::device get_device() const {
-    return _comm.get_device().get_native();
-  }
-
-  /// \brief allreduce is a collective communication operation that performs the global reduction operation
-  ///       on values from all ranks of communicator and distributes the result back to all ranks.
-  /// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced
-  /// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff
-  /// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff
-  /// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
-  /// \param rtype the type of the reduction operation to be applied
-  /// \param queue_ptr a sycl::queue ptr associated with the operation
-  /// \return @ref void
-  void allreduce(const void *sendbuff, void *recvbuff, size_t count,
-                 oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
-                 sycl::queue *queue_ptr) {
-    call_func_wrapper(
-        [=](const oneapi::ccl::stream &stream) {
-          return oneapi::ccl::allreduce(sendbuff, recvbuff, count, dtype, rtype,
-                                        _comm, stream);
-        },
-        queue_ptr);
-  }
-
-  /// \brief reduce is a collective communication operation that performs the
-  ///        global reduction operation on values from all ranks of the communicator
-  ///        and returns the result to the root rank.
-  /// \param sendbuff the buffer with @c count elements of @c dtype that stores
-  ///        local data to be reduced 
-  /// \param recvbuff [out] the buffer to store reduced result, 
-  ///        must have the same dimension as @c sendbuff 
-  /// \param count the number of elements of type @c dtype in @c sendbuff and @c recvbuff 
-  /// \param dtype the datatype of elements in @c sendbuff and @c recvbuff 
-  /// \param root the rank that gets the result of reduction 
-  /// \param rtype the type of the reduction operation to be applied 
-  /// \param queue_ptr a sycl::queue ptr associated with the operation 
-  /// \return @ref void
-  void reduce(const void *sendbuff, void *recvbuff, size_t count,
-              oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
-              int root, sycl::queue *queue_ptr) {
-    call_func_wrapper(
-        [=](const oneapi::ccl::stream &stream) {
-          return oneapi::ccl::reduce(sendbuff, recvbuff, count, dtype, rtype,
-                                     root, _comm, stream);
-        },
-        queue_ptr);
-  }
-
-  /// \brief broadcast is a collective communication operation that broadcasts data
-  ///        from one rank of communicator (denoted as root) to all other ranks.
-  ///        Only support in-place operation
-  /// \param sendbuff the buffer with @c count elements of @c dtype that stores
-  ///        local data to be reduced 
-  /// \param recvbuff [out] the buffer to store reduced result
-  /// \param count the number of elements of type @c dtype in @c buf 
-  /// \param dtype thedatatype of elements in @c buf 
-  /// \param root the rank that broadcasts @c buf
-  /// \param queue_ptr a sycl::queue ptr associated with the operation
-  /// \return @ref void
-  void broadcast(void *sendbuff, void *recvbuff, size_t count,
-                 oneapi::ccl::datatype dtype, int root,
-                 sycl::queue *queue_ptr) {
-    if (sendbuff != recvbuff) {
-      throw std::runtime_error(
-          "oneCCL broadcast only support in-place operation. "
-          "sendbuff and recvbuff must be same.");
-      return;
-    }
-    call_func_wrapper(
-        [=](const oneapi::ccl::stream &stream) {
-          return oneapi::ccl::broadcast(recvbuff, count, dtype, root, _comm,
-                                        stream);
-        },
-        queue_ptr);
-  }
-
-  /// \brief reduce_scatter is a collective communication operation that performs the global reduction operation
-  ///        on values from all ranks of the communicator and scatters the result in blocks back to all ranks.
-  /// \param sendbuff the buffer with @c count elements of @c dtype that stores local data to be reduced
-  /// \param recvbuff [out] the buffer to store reduced result, must have the same dimension as @c sendbuff
-  /// \param recv_count the number of elements of type @c dtype in receive block
-  /// \param dtype the datatype of elements in @c sendbuff and @c recvbuff
-  /// \param rtype the type of the reduction operation to be applied
-  /// \param queue_ptr a sycl::queue ptr associated with the operation
-  /// \return @ref void
-  void reduce_scatter(const void *sendbuff, void *recvbuff, size_t recv_count,
-                      oneapi::ccl::datatype dtype, oneapi::ccl::reduction rtype,
-                      sycl::queue *queue_ptr) {
-    call_func_wrapper(
-        [=](const oneapi::ccl::stream &stream) {
-          return oneapi::ccl::reduce_scatter(sendbuff, recvbuff, recv_count,
-                                             dtype, rtype, _comm, stream);
-        },
-        queue_ptr);
-  }
-
-  /// \brief send is a pt2pt communication operation that sends data from one rank of communicator.
-  /// \param sendbuff the buffer with @c count elements of @c dtype serves as send buffer for root
-  /// \param count the number of elements of type @c dtype in @c sendbuff
-  /// \param dtype the datatype of elements in @c sendbuff
-  /// \param peer the rank that receives @c sendbuff
-  /// \param queue_ptr a sycl::queue ptr associated with the operation
-  /// \return @ref void
-  void send(void *sendbuff, size_t count, oneapi::ccl::datatype dtype, int peer,
-            sycl::queue *queue_ptr) {
-    call_func_wrapper(
-        [=](const oneapi::ccl::stream &stream) {
-          return oneapi::ccl::send(sendbuff, count, dtype, peer, _comm, stream);
-        },
-        queue_ptr);
-  }
-
-  /// \brief recv is a pt2pt communication operation that sends data from one rank of communicator.
-  /// \param recvbuff the buffer with @c count elements of @c dtype serves as  receive buffer
-  /// \param count the number of elements of type @c dtype in @c recvbuff
-  /// \param dtype the datatype of elements in @c recvbuff
-  /// \param peer the rank that receives @c recvbuff
-  /// \param queue_ptr a sycl::queue ptr associated with the operation
-  /// \return @ref void
-  void recv(void *recvbuff, size_t count, oneapi::ccl::datatype dtype, int peer,
-            sycl::queue *queue_ptr) {
-    call_func_wrapper(
-        [=](const oneapi::ccl::stream &stream) {
-          return oneapi::ccl::recv(recvbuff, count, dtype, peer, _comm, stream);
-        },
-        queue_ptr);
-  }
-
-private:
-  oneapi::ccl::device _device_comm;
-  oneapi::ccl::context _context_comm;
-  oneapi::ccl::communicator _comm;
-  sycl::queue _queue;
-  bool _queue_init;
-  oneapi::ccl::stream *_ccl_stream_ptr;
-
-  template <class Fn>
-  void call_func_wrapper(Fn func, sycl::queue *qptr) {
-    if (_queue_init && *qptr != _queue) {
-      call_func_async(func, qptr);
-    } else {
-      if(!_queue_init) {
-        _queue = *qptr;
-        _queue_init = true;
-        _ccl_stream_ptr = new oneapi::ccl::stream(oneapi::ccl::create_stream(_queue));
-      }
-      std::invoke(func, *_ccl_stream_ptr);
-    }
-  }
-
-  class call_func_async {
-    sycl::queue *_q_ptr;
-    struct call_async_impl {
-      oneapi::ccl::stream _ccl_stream_impl;
-      oneapi::ccl::event _ccl_event_impl;
-      template <class Fn>
-      explicit call_async_impl(Fn func, sycl::queue *qptr)
-          : _ccl_stream_impl(oneapi::ccl::create_stream(*qptr)),
-            _ccl_event_impl(std::invoke(func, _ccl_stream_impl)) {}
-    };
-    call_async_impl *_imp;
-
-  public:
-    template <class Fn>
-    explicit call_func_async(Fn func, sycl::queue *qptr)
-        : _q_ptr(qptr),
-          _imp(new call_async_impl(func, qptr)) {}
-    ~call_func_async() {
-      _q_ptr->submit([&](sycl::handler &cgh)
-                     { cgh.host_task([=]
-                                     {
-        _imp->_ccl_event_impl.wait();
-        delete _imp; }); });
-    }
-  };
-};
-
-typedef dpct::ccl::communicator_wrapper *comm_ptr;
-
-} // namespace ccl
-} // namespace dpct
-
-#endif // __DPCT_CCL_UTILS_HPP__
\ No newline at end of file
diff --git a/dpct/device.hpp b/dpct/device.hpp
deleted file mode 100644
index 729ebf625a472..0000000000000
--- a/dpct/device.hpp
+++ /dev/null
@@ -1,781 +0,0 @@
-//==---- device.hpp -------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_DEVICE_HPP__
-#define __DPCT_DEVICE_HPP__
-
-#include <sycl/sycl.hpp>
-#include <algorithm>
-#include <array>
-#include <cstring>
-#include <iostream>
-#include <mutex>
-#include <set>
-#include <sstream>
-#include <map>
-#include <vector>
-#include <thread>
-#if defined(__linux__)
-#include <unistd.h>
-#include <sys/syscall.h>
-#endif
-#if defined(_WIN64)
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#endif
-
-namespace dpct {
-namespace detail {
-static void get_version(const sycl::device &dev, int &major, int &minor) {
-  // Version string has the following format:
-  // a. OpenCL<space><major.minor><space><vendor-specific-information>
-  // b. <major.minor>
-  std::string ver;
-  ver = dev.get_info<sycl::info::device::version>();
-  std::string::size_type i = 0;
-  while (i < ver.size()) {
-    if (isdigit(ver[i]))
-      break;
-    i++;
-  }
-  major = std::stoi(&(ver[i]));
-  while (i < ver.size()) {
-    if (ver[i] == '.')
-      break;
-    i++;
-  }
-  i++;
-  minor = std::stoi(&(ver[i]));
-}
-} // namespace detail
-
-/// SYCL default exception handler
-inline auto exception_handler = [](sycl::exception_list exceptions) {
-  for (std::exception_ptr const &e : exceptions) {
-    try {
-      std::rethrow_exception(e);
-    } catch (sycl::exception const &e) {
-      std::cerr << "Caught asynchronous SYCL exception:" << std::endl
-                << e.what() << std::endl
-                << "Exception caught at file:" << __FILE__
-                << ", line:" << __LINE__ << std::endl;
-    }
-  }
-};
-
-typedef sycl::event *event_ptr;
-
-typedef sycl::queue *queue_ptr;
-
-typedef char *device_ptr;
-
-/// Destroy \p event pointed memory.
-///
-/// \param event Pointer to the sycl::event address.
-static void destroy_event(event_ptr event) {
-    delete event;
-}
-
-class device_info {
-public:
-  // get interface
-  const char *get_name() const { return _name; }
-  char *get_name() { return _name; }
-  template <typename WorkItemSizesTy = sycl::range<3>,
-            std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                 std::is_same_v<WorkItemSizesTy, int *>,
-                             int> = 0>
-  auto get_max_work_item_sizes() const {
-    if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
-      return sycl::range<3>(_max_work_item_sizes_i[0],
-                            _max_work_item_sizes_i[1],
-                            _max_work_item_sizes_i[2]);
-    else {
-      return _max_work_item_sizes_i;
-    }  
-  }
-  template <typename WorkItemSizesTy = sycl::range<3>,
-            std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                 std::is_same_v<WorkItemSizesTy, int *>,
-                             int> = 0>
-  auto get_max_work_item_sizes() {
-    if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
-      return sycl::range<3>(_max_work_item_sizes_i[0],
-                            _max_work_item_sizes_i[1],
-                            _max_work_item_sizes_i[2]);
-    else {
-      return _max_work_item_sizes_i;
-    }  
-  }
-  bool get_host_unified_memory() const { return _host_unified_memory; }
-  int get_major_version() const { return _major; }
-  int get_minor_version() const { return _minor; }
-  int get_integrated() const { return _integrated; }
-  int get_max_clock_frequency() const { return _frequency; }
-  int get_max_compute_units() const { return _max_compute_units; }
-  int get_max_work_group_size() const { return _max_work_group_size; }
-  int get_max_sub_group_size() const { return _max_sub_group_size; }
-  int get_max_work_items_per_compute_unit() const {
-    return _max_work_items_per_compute_unit;
-  }
-  int get_max_register_size_per_work_group() const {
-    return _max_register_size_per_work_group;
-  }
-  template <typename NDRangeSizeTy = size_t *,
-            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                 std::is_same_v<NDRangeSizeTy, int *>,
-                             int> = 0>
-  auto get_max_nd_range_size() const {
-    if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
-      return _max_nd_range_size;
-    else
-      return _max_nd_range_size_i;
-  }
-  template <typename NDRangeSizeTy = size_t *,
-            std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                 std::is_same_v<NDRangeSizeTy, int *>,
-                             int> = 0>
-  auto get_max_nd_range_size() {
-    if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
-      return _max_nd_range_size;
-    else
-      return _max_nd_range_size_i;
-  }
-  size_t get_global_mem_size() const { return _global_mem_size; }
-  size_t get_local_mem_size() const { return _local_mem_size; }
-  /// Returns the maximum clock rate of device's global memory in kHz. If
-  /// compiler does not support this API then returns default value 3200000 kHz.
-  unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
-  /// Returns the maximum bus width between device and memory in bits. If
-  /// compiler does not support this API then returns default value 64 bits.
-  unsigned int get_memory_bus_width() const { return _memory_bus_width; }
-  uint32_t get_device_id() const { return _device_id; }
-  std::array<unsigned char, 16> get_uuid() const { return _uuid; }
-  /// Returns global memory cache size in bytes.
-  unsigned int get_global_mem_cache_size() const {
-    return _global_mem_cache_size;
-  }
-
-  // set interface
-  void set_name(const char* name) {
-    size_t length = strlen(name);
-    if (length < 256) {
-      std::memcpy(_name, name, length + 1);
-    } else {
-      std::memcpy(_name, name, 255);
-      _name[255] = '\0';
-    }
-  }
-  void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes) {
-    for (int i = 0; i < 3; ++i)
-      _max_work_item_sizes_i[i] = max_work_item_sizes[i];
-  }
-  [[deprecated]] void
-  set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes) {
-    for (int i = 0; i < 3; ++i) {
-      _max_work_item_sizes_i[i] = max_work_item_sizes[i];
-    }
-  }
-  void set_host_unified_memory(bool host_unified_memory) {
-    _host_unified_memory = host_unified_memory;
-  }
-  void set_major_version(int major) { _major = major; }
-  void set_minor_version(int minor) { _minor = minor; }
-  void set_integrated(int integrated) { _integrated = integrated; }
-  void set_max_clock_frequency(int frequency) { _frequency = frequency; }
-  void set_max_compute_units(int max_compute_units) {
-    _max_compute_units = max_compute_units;
-  }
-  void set_global_mem_size(size_t global_mem_size) {
-    _global_mem_size = global_mem_size;
-  }
-  void set_local_mem_size(size_t local_mem_size) {
-    _local_mem_size = local_mem_size;
-  }
-  void set_max_work_group_size(int max_work_group_size) {
-    _max_work_group_size = max_work_group_size;
-  }
-  void set_max_sub_group_size(int max_sub_group_size) {
-    _max_sub_group_size = max_sub_group_size;
-  }
-  void
-  set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit) {
-    _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
-  }
-  void set_max_nd_range_size(int max_nd_range_size[]) {
-    for (int i = 0; i < 3; i++) {
-      _max_nd_range_size[i] = max_nd_range_size[i];
-      _max_nd_range_size_i[i] = max_nd_range_size[i];
-    }
-  }
-  void set_memory_clock_rate(unsigned int memory_clock_rate) {
-    _memory_clock_rate = memory_clock_rate;
-  }
-  void set_memory_bus_width(unsigned int memory_bus_width) {
-    _memory_bus_width = memory_bus_width;
-  }
-  void
-  set_max_register_size_per_work_group(int max_register_size_per_work_group) {
-    _max_register_size_per_work_group = max_register_size_per_work_group;
-  }
-  void set_device_id(uint32_t device_id) {
-    _device_id = device_id;
-  }
-  void set_uuid(std::array<unsigned char, 16> uuid) {
-    _uuid = std::move(uuid);
-  }
-  void set_global_mem_cache_size(unsigned int global_mem_cache_size) {
-    _global_mem_cache_size = global_mem_cache_size;
-  }
-
-private:
-  char _name[256];
-  int _max_work_item_sizes_i[3];
-  bool _host_unified_memory = false;
-  int _major;
-  int _minor;
-  int _integrated = 0;
-  int _frequency;
-  // Set estimated value 3200000 kHz as default value.
-  unsigned int _memory_clock_rate = 3200000;
-  // Set estimated value 64 bits as default value.
-  unsigned int _memory_bus_width = 64;
-  unsigned int _global_mem_cache_size;
-  int _max_compute_units;
-  int _max_work_group_size;
-  int _max_sub_group_size;
-  int _max_work_items_per_compute_unit;
-  int _max_register_size_per_work_group;
-  size_t _global_mem_size;
-  size_t _local_mem_size;
-  size_t _max_nd_range_size[3];
-  int _max_nd_range_size_i[3];
-  uint32_t _device_id;
-  std::array<unsigned char, 16> _uuid;
-};
-
-static int get_major_version(const sycl::device &dev) {
-  int major, minor;
-  detail::get_version(dev, major, minor);
-  return major;
-}
-
-static int get_minor_version(const sycl::device &dev) {
-  int major, minor;
-  detail::get_version(dev, major, minor);
-  return minor;
-}
-
-static void get_device_info(device_info &out, const sycl::device &dev) {
-  device_info prop;
-  prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
-
-  int major, minor;
-  detail::get_version(dev, major, minor);
-  prop.set_major_version(major);
-  prop.set_minor_version(minor);
-
-  prop.set_max_work_item_sizes(
-#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
-      // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
-      // is an enum class element
-      dev.get_info<sycl::info::device::max_work_item_sizes>());
-#else
-      // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
-      // an int
-      dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
-#endif
-  prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));
-
-  prop.set_max_clock_frequency(
-      dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);
-
-  prop.set_max_compute_units(
-      dev.get_info<sycl::info::device::max_compute_units>());
-  prop.set_max_work_group_size(
-      dev.get_info<sycl::info::device::max_work_group_size>());
-  prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
-  prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
-
-#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
-  if (dev.has(sycl::aspect::ext_intel_memory_clock_rate)) {
-    unsigned int tmp =
-        dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
-    if (tmp != 0)
-      prop.set_memory_clock_rate(1000 * tmp);
-  }
-  if (dev.has(sycl::aspect::ext_intel_memory_bus_width)) {
-    prop.set_memory_bus_width(
-        dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
-  }
-  if (dev.has(sycl::aspect::ext_intel_device_id)) {
-    prop.set_device_id(
-        dev.get_info<sycl::ext::intel::info::device::device_id>());
-  }
-  if (dev.has(sycl::aspect::ext_intel_device_info_uuid)) {
-    prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
-  }
-#elif defined(_MSC_VER) && !defined(__clang__)
-#pragma message("get_device_info: querying memory_clock_rate and \
-memory_bus_width are not supported by the compiler used. \
-Use 3200000 kHz as memory_clock_rate default value. \
-Use 64 bits as memory_bus_width default value.")
-#else
-#warning "get_device_info: querying memory_clock_rate and \
-memory_bus_width are not supported by the compiler used. \
-Use 3200000 kHz as memory_clock_rate default value. \
-Use 64 bits as memory_bus_width default value."
-#endif
-
-  size_t max_sub_group_size = 1;
-  std::vector<size_t> sub_group_sizes =
-      dev.get_info<sycl::info::device::sub_group_sizes>();
-
-  for (const auto &sub_group_size : sub_group_sizes) {
-    if (max_sub_group_size < sub_group_size)
-      max_sub_group_size = sub_group_size;
-  }
-
-  prop.set_max_sub_group_size(max_sub_group_size);
-
-  prop.set_max_work_items_per_compute_unit(
-      dev.get_info<sycl::info::device::max_work_group_size>());
-  int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
-  prop.set_max_nd_range_size(max_nd_range_size);
-
-  // Estimates max register size per work group, feel free to update the value
-  // according to device properties.
-  prop.set_max_register_size_per_work_group(65536);
-
-  prop.set_global_mem_cache_size(
-      dev.get_info<sycl::info::device::global_mem_cache_size>());
-  out = prop;
-}
-
-/// dpct device extension
-class device_ext : public sycl::device {
-  typedef std::mutex mutex_type;
-
-public:
-  device_ext() : sycl::device(), _ctx(*this) {}
-  ~device_ext() {
-    std::lock_guard<mutex_type> lock(m_mutex);
-    clear_queues();
-  }
-  device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this) {
-    std::lock_guard<mutex_type> lock(m_mutex);
-    init_queues();
-  }
-
-  int is_native_atomic_supported() { return 0; }
-  int get_major_version() const {
-    return dpct::get_major_version(*this);
-  }
-
-  int get_minor_version() const {
-    return dpct::get_minor_version(*this);
-  }
-
-  int get_max_compute_units() const {
-    return get_device_info().get_max_compute_units();
-  }
-
-  /// Return the maximum clock frequency of this device in KHz.
-  int get_max_clock_frequency() const {
-    return get_device_info().get_max_clock_frequency();
-  }
-
-  int get_integrated() const { return get_device_info().get_integrated(); }
-
-  int get_max_sub_group_size() const {
-    return get_device_info().get_max_sub_group_size();
-  }
-
-  int get_max_register_size_per_work_group() const {
-    return get_device_info().get_max_register_size_per_work_group();
-  }
-
-  int get_max_work_group_size() const {
-    return get_device_info().get_max_work_group_size();
-  }
-
-  int get_mem_base_addr_align() const {
-    return get_info<sycl::info::device::mem_base_addr_align>();
-  }
-
-  size_t get_global_mem_size() const {
-    return get_device_info().get_global_mem_size();
-  }
-
-  /// Get the number of bytes of free and total memory on the SYCL device.
-  /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
-  /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
-  void get_memory_info(size_t &free_memory, size_t &total_memory) {
-#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
-    if (!has(sycl::aspect::ext_intel_free_memory)) {
-      std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
-      free_memory = 0;
-    } else {
-      free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
-    }
-#else
-    std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
-    free_memory = 0;
-#if defined(_MSC_VER) && !defined(__clang__)
-#pragma message("Querying the number of bytes of free memory is not supported")
-#else
-#warning "Querying the number of bytes of free memory is not supported"
-#endif
-#endif
-    total_memory = get_device_info().get_global_mem_size();
-  }
-
-  void get_device_info(device_info &out) const {
-    dpct::get_device_info(out, *this);
-  }
-
-  device_info get_device_info() const {
-    device_info prop;
-    dpct::get_device_info(prop, *this);
-    return prop;
-  }
-
-  void reset() {
-    std::lock_guard<mutex_type> lock(m_mutex);
-    clear_queues();
-    init_queues();
-  }
-
-  sycl::queue &in_order_queue() { return *_q_in_order; }
-
-  sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
-
-  sycl::queue &default_queue() {
-#ifdef DPCT_USM_LEVEL_NONE
-    return out_of_order_queue();
-#else
-    return in_order_queue();
-#endif // DPCT_USM_LEVEL_NONE
-  }
-
-  void queues_wait_and_throw() {
-    std::unique_lock<mutex_type> lock(m_mutex);
-    std::vector<std::shared_ptr<sycl::queue>> current_queues(
-        _queues);
-    lock.unlock();
-    for (const auto &q : current_queues) {
-      q->wait_and_throw();
-    }
-    // Guard the destruct of current_queues to make sure the ref count is safe.
-    lock.lock();
-  }
-
-  sycl::queue *create_queue(bool enable_exception_handler = false) {
-#ifdef DPCT_USM_LEVEL_NONE
-    return create_out_of_order_queue(enable_exception_handler);
-#else
-    return create_in_order_queue(enable_exception_handler);
-#endif // DPCT_USM_LEVEL_NONE
-  }
-
-  sycl::queue *create_in_order_queue(bool enable_exception_handler = false) {
-    std::lock_guard<mutex_type> lock(m_mutex);
-    return create_queue_impl(enable_exception_handler,
-                             sycl::property::queue::in_order());
-  }
-
-  sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) {
-    std::lock_guard<mutex_type> lock(m_mutex);
-    return create_queue_impl(enable_exception_handler);
-  }
-
-  void destroy_queue(sycl::queue *&queue) {
-    std::lock_guard<mutex_type> lock(m_mutex);
-    _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
-                                  [=](const std::shared_ptr<sycl::queue> &q) -> bool {
-                                    return q.get() == queue;
-                                  }),
-                   _queues.end());
-    queue = nullptr;
-  }
-  void set_saved_queue(sycl::queue* q) {
-    std::lock_guard<mutex_type> lock(m_mutex);
-    _saved_queue = q;
-  }
-  sycl::queue *get_saved_queue() const {
-    std::lock_guard<mutex_type> lock(m_mutex);
-    return _saved_queue;
-  }
-  sycl::context get_context() const { return _ctx; }
-
-private:
-  void clear_queues() {
-    _queues.clear();
-    _q_in_order = _q_out_of_order = _saved_queue = nullptr;
-  }
-
-  void init_queues() {
-    _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
-    _q_out_of_order = create_queue_impl(true);
-    _saved_queue = &default_queue();
-  }
-
-  /// Caller should acquire resource \p m_mutex before calling this function.
-  template <class... Properties>
-  sycl::queue *create_queue_impl(bool enable_exception_handler,
-                                 Properties... properties) {
-    sycl::async_handler eh = {};
-    if (enable_exception_handler) {
-      eh = exception_handler;
-    }
-    _queues.push_back(std::make_shared<sycl::queue>(
-        _ctx, *this, eh,
-        sycl::property_list(
-#ifdef DPCT_PROFILING_ENABLED
-            sycl::property::queue::enable_profiling(),
-#endif
-            properties...)));
-
-    return _queues.back().get();
-  }
-
-  void get_version(int &major, int &minor) const {
-    detail::get_version(*this, major, minor);
-  }
-  sycl::queue *_q_in_order, *_q_out_of_order;
-  sycl::queue *_saved_queue;
-  sycl::context _ctx;
-  std::vector<std::shared_ptr<sycl::queue>> _queues;
-  mutable mutex_type m_mutex;
-};
-
-static inline unsigned int get_tid() {
-#if defined(__linux__)
-  return syscall(SYS_gettid);
-#elif defined(_WIN64)
-  return GetCurrentThreadId();
-#else
-#error "Only support Windows and Linux."
-#endif
-}
-
-/// device manager
-class dev_mgr {
-public:
-  device_ext &current_device() {
-    unsigned int dev_id=current_device_id();
-    check_id(dev_id);
-    return *_devs[dev_id];
-  }
-  device_ext &cpu_device() const {
-    std::lock_guard<std::recursive_mutex> lock(m_mutex);
-    if (_cpu_device == -1) {
-      throw std::runtime_error("no valid cpu device");
-    } else {
-      return *_devs[_cpu_device];
-    }
-  }
-  device_ext &get_device(unsigned int id) const {
-    std::lock_guard<std::recursive_mutex> lock(m_mutex);
-    check_id(id);
-    return *_devs[id];
-  }
-  unsigned int current_device_id() const {
-   std::lock_guard<std::recursive_mutex> lock(m_mutex);
-   auto it=_thread2dev_map.find(get_tid());
-   if(it != _thread2dev_map.end())
-      return it->second;
-    return DEFAULT_DEVICE_ID;
-  }
-
-/// Select device with a device ID.
-/// \param [in] id The id of the device which can
-/// be obtained through get_device_id(const sycl::device).
-  void select_device(unsigned int id) {
-    std::lock_guard<std::recursive_mutex> lock(m_mutex);
-    check_id(id);
-    _thread2dev_map[get_tid()]=id;
-  }
-  unsigned int device_count() { return _devs.size(); }
-
-  unsigned int get_device_id(const sycl::device &dev) {
-    unsigned int id = 0;
-    for(auto dev_item : _devs) {
-      if (*dev_item == dev) {
-        break;
-      }
-      id++;
-    }
-    return id;
-  }
-
-  template <class DeviceSelector>
-  std::enable_if_t<
-      std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
-  select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
-    sycl::device selected_device = sycl::device(selector);
-    unsigned int selected_device_id = get_device_id(selected_device);
-    select_device(selected_device_id);
-  }
-
-  /// Returns the instance of device manager singleton.
-  static dev_mgr &instance() {
-    static dev_mgr d_m;
-    return d_m;
-  }
-  dev_mgr(const dev_mgr &) = delete;
-  dev_mgr &operator=(const dev_mgr &) = delete;
-  dev_mgr(dev_mgr &&) = delete;
-  dev_mgr &operator=(dev_mgr &&) = delete;
-
-private:
-  mutable std::recursive_mutex m_mutex;
-  dev_mgr() {
-    sycl::device default_device =
-        sycl::device(sycl::default_selector_v);
-    _devs.push_back(std::make_shared<device_ext>(default_device));
-
-    std::vector<sycl::device> sycl_all_devs =
-        sycl::device::get_devices(sycl::info::device_type::all);
-    // Collect other devices except for the default device.
-    if (default_device.is_cpu())
-      _cpu_device = 0;
-    for (auto &dev : sycl_all_devs) {
-      if (dev == default_device) {
-        continue;
-      }
-      _devs.push_back(std::make_shared<device_ext>(dev));
-      if (_cpu_device == -1 && dev.is_cpu()) {
-        _cpu_device = _devs.size() - 1;
-      }
-    }
-  }
-  void check_id(unsigned int id) const {
-    if (id >= _devs.size()) {
-      throw std::runtime_error("invalid device id");
-    }
-  }
-  std::vector<std::shared_ptr<device_ext>> _devs;
-  /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
-  /// thread id in _thread2dev_map, which means default device should be used
-  /// for the current thread.
-  const unsigned int DEFAULT_DEVICE_ID = 0;
-  /// thread-id to device-id map.
-  std::map<unsigned int, unsigned int> _thread2dev_map;
-  int _cpu_device = -1;
-};
-
-/// Util function to get the default queue of current selected device depends on
-/// the USM config. Return the default out-of-ordered queue when USM-none is
-/// enabled, otherwise return the default in-ordered queue.
-static inline sycl::queue &get_default_queue() {
-  return dev_mgr::instance().current_device().default_queue();
-}
-
-/// Util function to get the default in-ordered queue of current device in
-/// dpct device manager.
-static inline sycl::queue &get_in_order_queue() {
-  return dev_mgr::instance().current_device().in_order_queue();
-}
-
-/// Util function to get the default out-of-ordered queue of current device in
-/// dpct device manager.
-static inline sycl::queue &get_out_of_order_queue() {
-  return dev_mgr::instance().current_device().out_of_order_queue();
-}
-
-/// Util function to get the id of current device in
-/// dpct device manager.
-static inline unsigned int get_current_device_id() {
-  return dev_mgr::instance().current_device_id();
-}
-
-/// Util function to get the current device.
-static inline device_ext &get_current_device() {
-  return dev_mgr::instance().current_device();
-}
-
-/// Util function to get a device by id.
-static inline device_ext &get_device(unsigned int id) {
-  return dev_mgr::instance().get_device(id);
-}
-
-/// Util function to get the context of the default queue of current
-/// device in dpct device manager.
-static inline sycl::context get_default_context() {
-  return dpct::get_current_device().get_context();
-}
-
-/// Util function to get a CPU device.
-static inline device_ext &cpu_device() {
-  return dev_mgr::instance().cpu_device();
-}
-
-static inline unsigned int select_device(unsigned int id) {
-  dev_mgr::instance().select_device(id);
-  return id;
-}
-
-template <class DeviceSelector>
-static inline std::enable_if_t<
-    std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
-select_device(const DeviceSelector &selector = sycl::gpu_selector_v) {
-  dev_mgr::instance().select_device(selector);
-}
-
-static inline unsigned int get_device_id(const sycl::device &dev){
-  return dev_mgr::instance().get_device_id(dev);
-}
-
-/// Util function to check whether a device supports some kinds of sycl::aspect.
-inline void
-has_capability_or_fail(const sycl::device &dev,
-                       const std::initializer_list<sycl::aspect> &props) {
-  for (const auto &it : props) {
-    if (dev.has(it))
-      continue;
-    switch (it) {
-    case sycl::aspect::fp64:
-      throw std::runtime_error("'double' is not supported in '" +
-                               dev.get_info<sycl::info::device::name>() +
-                               "' device");
-      break;
-    case sycl::aspect::fp16:
-      throw std::runtime_error("'half' is not supported in '" +
-                               dev.get_info<sycl::info::device::name>() +
-                               "' device");
-      break;
-    default:
-#define __SYCL_ASPECT(ASPECT, ID)                                              \
-  case sycl::aspect::ASPECT:                                                   \
-    return #ASPECT;
-#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
-#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
-      auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string {
-        switch (AspectNum) {
-#include <sycl/info/aspects.def>
-#include <sycl/info/aspects_deprecated.def>
-        default:
-          return "unknown aspect";
-        }
-      };
-#undef __SYCL_ASPECT_DEPRECATED_ALIAS
-#undef __SYCL_ASPECT_DEPRECATED
-#undef __SYCL_ASPECT
-      throw std::runtime_error(
-          "'" + getAspectNameStr(it) + "' is not supported in '" +
-          dev.get_info<sycl::info::device::name>() + "' device");
-    }
-    break;
-  }
-}
-} // namespace dpct
-
-#endif // __DPCT_DEVICE_HPP__
diff --git a/dpct/dnnl_utils.hpp b/dpct/dnnl_utils.hpp
deleted file mode 100644
index caf5a768b77e2..0000000000000
--- a/dpct/dnnl_utils.hpp
+++ /dev/null
@@ -1,4921 +0,0 @@
-//==---- dnnl_utils.hpp ---------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_DNNL_UTILS_HPP__
-#define __DPCT_DNNL_UTILS_HPP__
-
-#include <oneapi/dpl/algorithm>
-#include <oneapi/dpl/execution>
-#include <oneapi/dpl/numeric>
-#include <oneapi/mkl.hpp>
-#include <oneapi/mkl/rng/device.hpp>
-#include <sycl/sycl.hpp>
-#include <oneapi/dnnl/dnnl.hpp>
-#include <oneapi/dnnl/dnnl_sycl.hpp>
-#include <unordered_map>
-#include <algorithm>
-#include <list>
-
-#include "memory.hpp"
-#include "device.hpp"
-#include "lib_common_utils.hpp"
-
-namespace dpct {
-namespace dnnl {
-/// Get concatenated library version as an integer.
-static inline size_t get_version() {
-  const ::dnnl::version_t *ver = ::dnnl::version();
-  return ver->major * 1000 + ver->minor * 100 + ver->patch;
-}
-class engine_ext;
-typedef oneapi::mkl::rng::philox4x32x10 rng_engine_t;
-/// An enum class representing memory layout. Used by
-/// memory_desc_ext to create a memory with pre-defined layout.
-enum class memory_format_tag { nchw, nhwc, nchw_blocked };
-
-/// An enum class representing RNN data memory layout. Used by
-/// memory_desc_ext to create a memory with pre-defined layout.
-enum class rnn_memory_format_tag { tnc, ntc };
-
-/// A class holding the description of an N-dimensions memory.
-class memory_desc_ext {
-  ::dnnl::memory::desc _desc;
-public:
-  /// Convert dpct::library_data_t to dnnl::memory::data_type.
-  static ::dnnl::memory::data_type to_dnnl_data_type(dpct::library_data_t dt);
-  /// Convert dnnl::memory::data_type to dpct::library_data_t.
-  static dpct::library_data_t
-  to_dpct_library_data_t(::dnnl::memory::data_type dt, unsigned block_size);
-  /// Convert dpct::dnnl::memory_format_tag to dnnl::memory::format_tag.
-  static ::dnnl::memory::format_tag to_dnnl_format_tag(dpct::library_data_t dt,
-                                                       memory_format_tag tag);
-  memory_desc_ext() = default;
-  memory_desc_ext(::dnnl::memory::desc &desc) : _desc(desc) {}
-  memory_desc_ext(::dnnl::memory::desc &&desc) : _desc(std::move(desc)) {}
-  /// Setting a 4D memory with given parameters.
-  /// \param [in] tag Format tag.
-  /// \param [in] dt Data type.
-  /// \param [in] n Number of images.
-  /// \param [in] c Number of channels.
-  /// \param [in] h Height of images.
-  /// \param [in] w Width of images.
-  void set(memory_format_tag tag, dpct::library_data_t dt, int n, int c, int h,
-           int w);
-  /// Setting a 3D RNN data memory with given parameters.
-  /// \param [in] tag RNN data format tag.
-  /// \param [in] dt Data type.
-  /// \param [in] t Number of sequence length.
-  /// \param [in] n Number of batch.
-  /// \param [in] c Height of input channel.
-  void set(rnn_memory_format_tag tag, dpct::library_data_t dt, int t, int n, int c);
-  /// Setting a 4D memory with given parameters.
-  /// \param [in] dt Data type.
-  /// \param [in] n Number of images.
-  /// \param [in] c Number of channels.
-  /// \param [in] h Height of images.
-  /// \param [in] w Width of images.
-  /// \param [in] n_stride Stride between two continuous images.
-  /// \param [in] c_stride Stride between two continuous channels.
-  /// \param [in] h_stride Stride between two continuous rows.
-  /// \param [in] w_stride Stride between two continuous columns.
-  void set(dpct::library_data_t dt, int n, int c, int h, int w, int n_stride,
-           int c_stride, int h_stride, int w_stride);
-  /// Setting a ND memory with given parameters.
-  /// \param [in] dt Data type.
-  /// \param [in] ndims Dimension of the memory.
-  /// \param [in] dims Array of dimension ndims that contain the size of each
-  /// memory dimension. \param [in] strides Array of dimension ndims that
-  /// contain the stride of each memory dimension.
-  void set(dpct::library_data_t dt, int ndims, const int dims[],
-           const int strides[]);
-  /// Setting a ND memory with given parameters.
-  /// \param [in] tag Format tag.
-  /// \param [in] dt Data type.
-  /// \param [in] ndims Dimension of the memory.
-  /// \param [in] dims Array of dimension ndims that contain the size of each
-  /// memory dimension.
-  void set(memory_format_tag tag, dpct::library_data_t dt, int ndims,
-           const int dims[]);
-  /// Getting a ::dnnl::memory::desc from a memory_desc_ext.
-  /// \returns The ::dnnl::memory::desc.
-  const ::dnnl::memory::desc &get_desc() const { return _desc; }
-  /// Setting holding desc with given dnnl memory descriptor.
-  void set_desc(::dnnl::memory::desc desc) { _desc = desc; }
-  /// Getting a size of a memory_desc_ext in bytes.
-  /// \returns The size.
-  size_t get_size() const { return _desc.get_size(); }
-  /// Getting parameters from a 4D memory.
-  /// \param [out] dt Data type.
-  /// \param [out] n Number of images.
-  /// \param [out] c Number of channels.
-  /// \param [out] h Height of images.
-  /// \param [out] w Width of images.
-  /// \param [out] n_stride Stride between two continuous images.
-  /// \param [out] c_stride Stride between two continuous channels.
-  /// \param [out] h_stride Stride between two continuous rows.
-  /// \param [out] w_stride Stride between two continuous columns.
-  void get(dpct::library_data_t *dt, int *n, int *c, int *h, int *w,
-           int *n_stride, int *c_stride, int *h_stride, int *w_stride) const;
-  /// Getting parameters from a 4D memory.
-  /// \param [out] dt Data type.
-  /// \param [out] tag Format tag.
-  /// \param [out] n Number of images.
-  /// \param [out] c Number of channels.
-  /// \param [out] h Height of images.
-  /// \param [out] w Width of images.
-  void get(dpct::library_data_t *dt, memory_format_tag *tag, int *n, int *c,
-           int *h, int *w) const;
-  /// Getting parameters from a 3D RNN data memory.
-  /// \param [out] dt Data type.
-  /// \param [out] tag RNN data format tag.
-  /// \param [out] t Number of sequence length.
-  /// \param [out] n Number of batch.
-  /// \param [out] c Height of input channel.
-  void get(dpct::library_data_t *dt, rnn_memory_format_tag *tag, int *t, int *n,
-           int *c) const;
-  /// Getting parameters from a ND memory.
-  /// \param [in] requested_ndims Requested number of dimensions to get from a
-  /// given memory descriptor.
-  /// \param [out] dt Data type.
-  /// \param [out] ndims Dimension of the memory.
-  /// \param [out] dims Array of dimension requested_ndims that contain the 
-  /// size of each memory dimension.
-  /// \param [out] strides Array of dimension requested_ndims that contain the
-  /// stride of each memory dimension.
-  void get(int requested_ndims, dpct::library_data_t *dt, int *ndims,
-           int dims[], int strides[]) const;
-  /// Getting parameters from a ND memory.
-  /// \param [in] requested_ndims Requested number of dimensions to get from a
-  /// given memory descriptor.
-  /// \param [out] dt Data type.
-  /// \param [out] tag Format tag.
-  /// \param [out] ndims Dimension of the memory.
-  /// \param [out] dims Array of dimension requested_ndims that contain the 
-  /// size of each memory dimension.
-  void get(int requested_ndims, dpct::library_data_t *dt,
-           memory_format_tag *tag, int *ndims, int dims[]) const;
-  /// Getting dims from a ND memory.
-  /// \return The dims.
-  std::vector<int64_t> get_dims() const { return _desc.get_dims(); }
-  /// Getting strides from a ND memory.
-  /// \return The strides.
-  std::vector<int64_t> get_strides() const {
-    return _desc.get_strides();
-  }
-  /// Getting element num from a ND memory.
-  /// \return The element number.
-  size_t get_element_num() const {
-    auto dims = _desc.get_dims();
-    if (dims.empty()) {
-      return 0;
-    }
-    size_t result = 1;
-    for (auto &dim : dims) {
-      result *= dim;
-    }
-    return result;
-  }
-
-  operator bool() const {
-    return bool(_desc);
-  }
-
-  memory_desc_ext &operator=(std::nullptr_t) {
-    _desc.reset(nullptr);
-    return *this;
-  }
-};
-
-/// A class holding description for an activation operation.
-class activation_desc {
-  ::dnnl::algorithm _alg;
-  float _alpha;
-  float _beta;
-
-public:
-  /// Setting an activation descriptor with given parameters.
-  /// \param [in] alg Activation algorithm.
-  /// \param [in] alpha Value of alpha parameter.
-  void set(::dnnl::algorithm alg, float alpha) {
-    _alg = alg;
-    if(alg == ::dnnl::algorithm::eltwise_clip) {
-      _alpha = 0;
-      _beta = alpha;
-    } else {
-      _alpha = alpha;
-    }
-  }
-  /// Getting parameters form an activation descriptor.
-  /// \param [out] alg Activation algorithm.
-  /// \param [out] alpha Value of alpha parameter.
-  void get(::dnnl::algorithm *alg, float *alpha) const {
-    *alg = _alg;
-    if(_alg == ::dnnl::algorithm::eltwise_clip) {
-      *alpha = _beta;
-    } else {
-      *alpha = _alpha;
-    }
-  }
-  /// Setting the alpha parameter of an activation descriptor.
-  /// \param [in] alpha Value of alpha parameter.
-  void set_alpha(float alpha) { _alpha = alpha; }
-  /// Setting the beta parameter of an activation descriptor.
-  /// \param [in] beta Value of beta parameter.
-  void set_beta(float beta) { _beta = beta; }
-  /// Setting the algorithm parameter of an activation descriptor.
-  /// \param [in] alg Activation algorithm.
-  void set_algorithm(::dnnl::algorithm alg) { _alg = alg; }
-  /// Getting the alpha parameter from an activation descriptor.
-  /// \param [out] alpha Value of alpha parameter.
-  float get_alpha() const { return _alpha; }
-  /// Getting the beta parameter from an activation descriptor.
-  /// \param [out] beta Value of beta parameter.
-  float get_beta() const { return _beta; }
-  /// Getting the algorithm parameter from an activation descriptor.
-  /// \param [out] alg Activation algorithm.
-  ::dnnl::algorithm get_algorithm() const { return _alg; }
-};
-
-/// A class holding description for a local response normalization operation.
-class lrn_desc {
-  unsigned int _local_size;
-  float _alpha;
-  float _beta;
-  float _k;
-
-public:
-  /// Setting a local response normalization descriptor with given parameters.
-  /// \param [in] local_size Value of local_size parameter.
-  /// \param [in] alpha Value of alpha parameter.
-  /// \param [in] beta Value of beta parameter.
-  /// \param [in] k Value of k parameter.
-  void set(unsigned int local_size, float alpha, float beta, float k) {
-    _local_size = local_size;
-    _alpha = alpha;
-    _beta = beta;
-    _k = k;
-  }
-  /// Getting parameters form a local response normalization descriptor.
-  /// \param [out] local_size Value of local_size parameter.
-  /// \param [out] alpha Value of alpha parameter.
-  /// \param [out] beta Value of beta parameter.
-  /// \param [out] k Value of k parameter.
-  void get(unsigned int *local_size, float *alpha, float *beta,
-           float *k) const {
-    *local_size = _local_size;
-    *alpha = _alpha;
-    *beta = _beta;
-    *k = _k;
-  }
-  /// Setting the local size parameter of a local response normalization
-  /// descriptor.
-  /// \param [in] local_size Value of local_size parameter.
-  void set_local_size(unsigned int local_size) { _local_size = local_size; }
-  /// Setting the alpha parameter of a local response normalization descriptor.
-  /// \param [in] alpha Value of alpha parameter.
-  void set_alpha(float alpha) { _alpha = alpha; }
-  /// Setting the beta parameter of a local response normalization descriptor.
-  /// \param [in] beta Value of beta parameter.
-  void set_beta(float beta) { _beta = beta; }
-  /// Setting the k parameter of a local response normalization descriptor.
-  /// \param [in] k Value of k parameter.
-  void set_k(float k) { _k = k; }
-  /// Getting the local size parameter from a local response normalization
-  /// descriptor.
-  /// \param [out] local_size Value of local_size parameter.
-  unsigned int get_local_size() const { return _local_size; }
-  /// Getting the alpha parameter from a local response normalization
-  /// descriptor.
-  /// \param [out] alpha Value of alpha parameter.
-  float get_alpha() const { return _alpha; }
-  /// Getting the beta parameter from a local response normalization descriptor.
-  /// \param [out] beta Value of beta parameter.
-  float get_beta() const { return _beta; }
-  /// Getting the k parameter from a local response normalization descriptor.
-  /// \param [out] k Value of k parameter.
-  float get_k() const { return _k; }
-};
-
-/// An enum class representing softmax algorithm.
-enum class softmax_algorithm { normal, log };
-/// An enum class representing softmax mode.
-enum class softmax_mode { instance, channel };
-
-/// A class holding description for a pooling operation.
-class pooling_desc {
-  ::dnnl::algorithm _alg;
-  std::vector<int64_t> _stride;
-  std::vector<int64_t> _kernel;
-  std::vector<int64_t> _padding;
-
-public:
-  /// Setting a 2D pooling descriptor with given parameters.
-  /// \param [in] alg Pooling algorithm.
-  /// \param [in] kernel_h Value of height of kernel.
-  /// \param [in] kernel_w Value of width of kernel.
-  /// \param [in] padding_h Value of height of padding.
-  /// \param [in] padding_w Value of width of padding.
-  /// \param [in] stride_h Value of height of stride.
-  /// \param [in] stride_w Value of width of stride.
-  void set(::dnnl::algorithm alg, int kernel_h, int kernel_w, int padding_h,
-           int padding_w, int stride_h, int stride_w) {
-    _alg = alg;
-    _stride = {stride_h, stride_w};
-    _kernel = {kernel_h, kernel_w};
-    _padding = {padding_h, padding_w};
-  }
-  /// Setting a ND pooling descriptor with given parameters.
-  /// \param [in] alg Pooling algorithm.
-  /// \param [in] ndims Dimension of the pooling operation.
-  /// \param [in] kernel Array of dimension ndims containing the kernel size of
-  /// each dimension.
-  /// \param [in] padding Array of dimension ndims containing the padding size of
-  /// each dimension.
-  /// \param [in] stride Array of dimension ndims containing the stride size of
-  /// each dimension.
-  void set(::dnnl::algorithm alg, int ndims, int kernel[], int padding[],
-           int stride[]) {
-    _alg = alg;
-    _stride = std::vector<int64_t>(stride, stride + ndims);
-    _kernel = std::vector<int64_t>(kernel, kernel + ndims);
-    _padding = std::vector<int64_t>(padding, padding + ndims);
-  }
-  /// Getting parameters from a 2D pooling descriptor.
-  /// \param [out] alg Pooling algorithm.
-  /// \param [out] kernel_h Value of height of kernel.
-  /// \param [out] kernel_w Value of width of kernel.
-  /// \param [out] padding_h Value of height of padding.
-  /// \param [out] padding_w Value of width of padding.
-  /// \param [out] stride_h Value of height of stride.
-  /// \param [out] stride_w Value of width of stride.
-  void get(::dnnl::algorithm *alg, int *kernel_h, int *kernel_w, int *padding_h,
-           int *padding_w, int *stride_h, int *stride_w) const {
-    *alg = _alg;
-    *kernel_h = _kernel[0];
-    *kernel_w = _kernel[1];
-    *padding_h = _padding[0];
-    *padding_w = _padding[1];
-    *stride_h = _stride[0];
-    *stride_w = _stride[1];
-  }
-  /// Getting parameters from a ND pooling descriptor.
-  /// \param [in] requested_ndims Requested number of dimensions to get from a
-  /// given pooling descriptor.
-  /// \param [out] alg Pooling algorithm.
-  /// \param [out] ndims Dimension of the pooling operation.
-  /// \param [out] kernel Array of dimension ndims containing the kernel size of
-  /// each dimension.
-  /// \param [out] padding Array of dimension ndims containing the padding size
-  /// of each dimension.
-  /// \param [out] stride Array of dimension ndims containing the stride size of
-  /// each dimension.
-  void get(int requested_ndims, ::dnnl::algorithm *alg, int *ndims,
-           int kernel[], int padding[], int stride[]) const {
-    *alg = _alg;
-    *ndims = _stride.size();
-    for (int i = 0; i < requested_ndims; i++) {
-      kernel[i] = _kernel[i];
-      padding[i] = _padding[i];
-      stride[i] = _stride[i];
-    }
-  }
-  /// Setting the algorithm parameter of a pooling descriptor.
-  /// \param [in] alg Pooling algorithm.
-  void set_algorithm(::dnnl::algorithm alg) { _alg = alg; }
-  /// Setting the stride parameter of a pooling descriptor.
-  /// \param [in] stride Array of dimension ndims containing the stride size of
-  /// each dimension.
-  void set_stride(const std::vector<int64_t> &stride) { _stride = stride; }
-  /// Setting the kernel parameter of a pooling descriptor.
-  /// \param [in] kernel Array of dimension ndims containing the kernel size of
-  /// each dimension.
-  void set_kernel(const std::vector<int64_t> &kernel) { _kernel = kernel; }
-  /// Setting the padding parameter of a pooling descriptor.
-  /// \param [in] padding Array of dimension ndims containing the padding size
-  /// of each dimension.
-  void set_padding(const std::vector<int64_t> &padding) { _padding = padding; }
-
-  /// Getting the algorithm parameter from a pooling descriptor.
-  /// \param [out] alg Pooling algorithm.
-  ::dnnl::algorithm get_algorithm() const { return _alg; }
-  /// Getting the stride parameter from a pooling descriptor.
-  /// \returns Array of dimension ndims containing the stride size of each
-  /// dimension.
-  const std::vector<int64_t> &get_stride() const { return _stride; }
-  /// Getting the kernel parameter from a pooling descriptor.
-  /// \returns Array of dimension ndims containing the kernel size of each
-  /// dimension.
-  const std::vector<int64_t> &get_kernel() const { return _kernel; }
-  /// Getting the padding parameter from a pooling descriptor.
-  /// \returns Array of dimension ndims containing the padding size of each
-  /// dimension.
-  const std::vector<int64_t> &get_padding() const { return _padding; }
-  /// Getting the output dimensions of a memory after 2D pooling has been
-  /// applied.
-  /// \param [in] desc Input memory descriptor.
-  /// \param [out] out_n Number of images.
-  /// \param [out] out_c Number of channels.
-  /// \param [out] out_h Height of images.
-  /// \param [out] out_w Width of images.
-  void get_forward_output_dim(const memory_desc_ext &desc, int *out_n,
-                              int *out_c, int *out_h, int *out_w) const {
-    auto dims = desc.get_dims();
-    *out_n = dims[0];
-    *out_c = dims[1];
-    *out_h = 1 + (dims[2] + 2 * _padding[0] - _kernel[0]) / _stride[0];
-    *out_w = 1 + (dims[3] + 2 * _padding[1] - _kernel[1]) / _stride[1];
-  }
-  /// Getting the output dimensions of a memory after ND pooling has been
-  /// applied.
-  /// \param [in] desc Input memory descriptor.
-  /// \param [out] ndims Dimension of the memory.
-  /// \param [out] out_dims Array of dimension requested_ndims that contain
-  /// the size of each memory dimension.
-  void get_forward_output_dim(const memory_desc_ext &desc, int ndims,
-                              int out_dims[]) const {
-    assert(ndims >= 4 && "ndims is at least 4.");
-    auto dims = desc.get_dims();
-    out_dims[0] = dims[0];
-    out_dims[1] = dims[1];
-    for (int i = 2; i < ndims; i++) {
-      out_dims[i] =
-          1 + (dims[i] + 2 * _padding[i - 2] - _kernel[i - 2]) / _stride[i - 2];
-    }
-  }
-};
-
-/// An enum class representing reduction operations.
-enum class reduction_op {
-  max,
-  min,
-  sum,
-  mul,
-  mean,
-  amax,
-  mul_no_zeros,
-  norm1,
-  norm2
-};
-
-/// An enum class representing batch normalization mode.
-enum class batch_normalization_mode { per_activation, spatial };
-
-/// An enum class representing batch normalization operations.
-enum class batch_normalization_ops { none, activation, add_activation };
-
-/// An enum class representing binary operations.
-enum class binary_op { add, sub, mul, div, min, max, sqrt, neg };
-
-/// An struct representing convolution algorithm infomation.
-struct convolution_algorithm_info {
-  ::dnnl::algorithm algo = ::dnnl::algorithm::convolution_auto;
-  int status = 0;
-};
-
-/// A class holding description for a convolution operation.
-class convolution_desc {
-  std::vector<int64_t> _strides;
-  std::vector<int64_t> _dilates;
-  std::vector<int64_t> _paddings;
-  int _group_count = 1;
-  ::dnnl::fpmath_mode _math_mode = ::dnnl::fpmath_mode::strict;
-public:
-  /// Setting a group count to be used in the convolution.
-  /// \param [in] group_count Value of group count.
-  void set_group_count(int group_count) { _group_count = group_count; }
-  /// Getting a group count specified in the given convolution descriptor.
-  /// \returns Value of group count.
-  int get_group_count() { return _group_count; }
-  /// Setting floating point math mode to be used in the convolution.
-  /// \param [in] math_mode Value of math_mode.
-  void set_math_mode(::dnnl::fpmath_mode math_mode) { _math_mode = math_mode; }
-  /// Getting floating point math mode specified in the given convolution descriptor.
-  /// \returns Value of math mode.
-  ::dnnl::fpmath_mode get_math_mode() { return _math_mode; }
-  /// Setting a 2D convolution descriptor with given parameters.
-  /// \param [in] padding_h Value of height of padding.
-  /// \param [in] padding_w Value of width of padding.
-  /// \param [in] stride_h Value of height of stride.
-  /// \param [in] stride_w Value of width of stride.
-  /// \param [in] dilate_h Value of height of dilate.
-  /// \param [in] dilate_w Value of width of dilate.
-  void set(int padding_h, int padding_w, int stride_h, int stride_w,
-           int dilate_h, int dilate_w) {
-    _strides = {stride_h, stride_w};
-    _dilates = {dilate_h - 1, dilate_w - 1};
-    _paddings = {padding_h, padding_w};
-  }
-  /// Setting a ND convolution descriptor with given parameters.
-  /// \param [in] ndims Dimension of the convolution operation.
-  /// \param [in] paddings Array of dimension ndims containing the padding size of
-  /// each dimension.
-  /// \param [in] strides Array of dimension ndims containing the stride size of
-  /// each dimension.
-  /// \param [in] dilates Array of dimension ndims containing the kernel size of
-  /// each dimension.
-  void set(int ndims, int paddings[], int strides[], int dilates[]) {
-    _strides = std::vector<int64_t>(strides, strides + ndims);
-    _paddings = std::vector<int64_t>(paddings, paddings + ndims);
-    _dilates = std::vector<int64_t>(dilates, dilates + ndims);
-    for (auto &dilate : _dilates) {
-      dilate--;
-    }
-  }
-  /// Getting parameters from a 2D convolution descriptor.
-  /// \param [out] padding_h Value of height of padding.
-  /// \param [out] padding_w Value of width of padding.
-  /// \param [out] stride_h Value of height of stride.
-  /// \param [out] stride_w Value of width of stride.
-  /// \param [out] dilate_h Value of height of dilate.
-  /// \param [out] dilate_w Value of width of dilate.
-  void get(int *padding_h, int *padding_w, int *stride_h, int *stride_w,
-           int *dilate_h, int *dilate_w) const {
-    *dilate_h = _dilates[0];
-    *dilate_w = _dilates[1];
-    *padding_h = _paddings[0];
-    *padding_w = _paddings[1];
-    *stride_h = _strides[0];
-    *stride_w = _strides[1];
-  }
-  /// Getting parameters from a ND convolution descriptor.
-  /// \param [in] requested_ndims Requested number of dimensions to get from a
-  /// given convolution descriptor.
-  /// \param [out] ndims Dimension of the pooling operation.
-  /// \param [out] paddings Array of dimension ndims containing the padding size
-  /// of each dimension.
-  /// \param [out] strides Array of dimension ndims containing the stride size of
-  /// each dimension.
-  /// \param [out] dilates Array of dimension ndims containing the dilate size of
-  /// each dimension.
-  void get(int requested_ndims, int *ndims, int paddings[], int strides[],
-           int dilates[]) const {
-    *ndims = _strides.size();
-    for (int i = 0; i < requested_ndims; i++) {
-      dilates[i] = _dilates[i];
-      paddings[i] = _paddings[i];
-      strides[i] = _strides[i];
-    }
-  }
-  /// Getting the stride parameter from a convolution descriptor.
-  /// \returns Array of dimension ndims containing the stride size of each
-  /// dimension.
-  const std::vector<int64_t> &get_stride() const { return _strides; }
-  /// Getting the kernel parameter from a convolution descriptor.
-  /// \returns Array of dimension ndims containing the dilate size of each
-  /// dimension.
-  const std::vector<int64_t> &get_dilate() const { return _dilates; }
-  /// Getting the padding parameter from a convolution descriptor.
-  /// \returns Array of dimension ndims containing the padding size of each
-  /// dimension.
-  const std::vector<int64_t> &get_padding() const { return _paddings; }
-  /// Getting the output dimensions of a memory after 2D convolution has been
-  /// applied.
-  /// \param [in] desc Input memory descriptor.
-  /// \param [in] weight_desc Input weight memory descriptor.
-  /// \param [out] out_n Number of images.
-  /// \param [out] out_c Number of channels.
-  /// \param [out] out_h Height of images.
-  /// \param [out] out_w Width of images.
-  void get_forward_output_dim(const memory_desc_ext &desc,
-                              const memory_desc_ext &weight_desc, int *out_n,
-                              int *out_c, int *out_h, int *out_w) const {
-    auto dims = desc.get_dims();
-    auto weight_dims = weight_desc.get_dims();
-    *out_n = dims[0];
-    *out_c = weight_dims[0];
-    *out_h = 1 + (dims[2] + 2 * _paddings[0] -
-                  (1 + (_dilates[0] * (weight_dims[2] - 1)))) /
-                     _strides[0];
-    *out_w = 1 + (dims[3] + 2 * _paddings[1] -
-                  (1 + (_dilates[1] * (weight_dims[3] - 1)))) /
-                     _strides[1];
-  }
-  /// Getting the output dimensions of a memory after ND convolution has been
-  /// applied.
-  /// \param [in] desc Input memory descriptor.
-  /// \param [in] weight_desc Input weight memory descriptor.
-  /// \param [out] ndims Dimension of the memory.
-  /// \param [out] out_dims Array of dimension requested_ndims that contain
-  /// the size of each memory dimension.
-  void get_forward_output_dim(const memory_desc_ext &desc,
-                              const memory_desc_ext &weight_desc, int ndims,
-                              int out_dims[]) const {
-    assert(ndims >= 4 && "ndims is at least 4.");
-    auto dims = desc.get_dims();
-    auto weight_dims = weight_desc.get_dims();
-    out_dims[0] = dims[0];
-    out_dims[1] = weight_dims[1];
-    for (int i = 2; i < ndims; i++) {
-      out_dims[i] = 1 + (dims[i] + 2 * _paddings[i - 2] -
-                         (1 + (_dilates[i - 2] * (weight_dims[i] - 1)))) /
-                            _strides[i - 2];
-    }
-  }
-
-  convolution_desc &operator=(std::nullptr_t) {
-    return *this = convolution_desc();
-  }
-
-  operator bool() const {
-    return !(_strides.size() == 0
-             && _dilates.size() == 0
-             && _paddings.size() == 0);
-  }
-};
-
-/// An enum class representing rnn mode.
-enum class rnn_mode { vanilla_relu, vanilla_tanh, lstm, gru };
-
-/// An enum class representing rnn bias mode.
-enum class rnn_bias_mode { none, single };
-
-/// An enum class representing rnn direction.
-enum class rnn_direction {unidirectional, bidirectional};
-
-/// A class holding description for a RNN operation.
-class rnn_desc {
-  rnn_mode _mode;
-  rnn_bias_mode _bias_mode;
-  rnn_direction _direction;
-  dpct::library_data_t _dt;
-  int _input_size;
-  int _hidden_size;
-  int _projection_size;
-  int _layer_size;
-
-public:
-  void set(rnn_mode mode, rnn_bias_mode bias_mode, rnn_direction direction,
-           dpct::library_data_t dt, int input_size, int hidden_size,
-           int projection_size, int layer_size) {
-    _mode = mode;
-    _bias_mode = bias_mode;
-    _direction = direction;
-    _input_size = input_size;
-    _hidden_size = hidden_size;
-    _projection_size = projection_size;
-    _layer_size = layer_size;
-    _dt = dt;
-  }
-  void get(rnn_mode *mode, rnn_bias_mode *bias_mode, rnn_direction *direction,
-           dpct::library_data_t *dt, int *input_size, int *hidden_size,
-           int *projection_size, int *layer_size) const {
-    *mode = _mode;
-    *bias_mode = _bias_mode;
-    *direction = _direction;
-    *input_size = _input_size;
-    *hidden_size = _hidden_size;
-    *projection_size = _projection_size;
-    *layer_size = _layer_size;
-    *dt = _dt;
-  }
-};
-
-/// A class holding description for a Dropout operation.
-class dropout_desc {
-  struct dropout_desc_imp {
-    float _p = 0.5f;
-    unsigned long long _seed = 1;
-    void *_state = nullptr;
-    std::vector<std::uint8_t> _host_state;
-    rng_engine_t _rng_engine;
-    dropout_desc_imp() : _rng_engine(dpct::get_default_queue(), 1) {}
-  };
-  std::shared_ptr<dropout_desc_imp> _imp;
-
-  void generate(sycl::queue *q, std::int64_t required_state_size,
-                std::int64_t num, void *buffer) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
-                             "Interfaces Project does not support this API.");
-#else
-    sycl::event e_gen = oneapi::mkl::rng::generate(
-        oneapi::mkl::rng::bernoulli<std::int32_t>(1.f - _imp->_p),
-        _imp->_rng_engine, num, (std::int32_t *)buffer);
-    sycl::event e_save = q->submit([&](sycl::handler &cgh) {
-      cgh.depends_on(e_gen);
-      cgh.host_task([=] {
-        oneapi::mkl::rng::save_state(_imp->_rng_engine,
-                                     _imp->_host_state.data());
-      });
-    });
-    q->memcpy(_imp->_state, _imp->_host_state.data(), required_state_size,
-              e_save);
-#endif
-  }
-public:
-  operator bool() const {
-    return bool(_imp);
-  }
-  dropout_desc &operator=(std::nullptr_t) {
-    _imp.reset();
-    return *this;
-  }
-  /// Initializing a dropout descriptor.
-  void init(){
-    _imp = std::make_shared<dropout_desc_imp>();
-  }
-  /// Setting a dropout descriptor with given parameters.
-  /// \param [in] engine Engine of the dropout operation.
-  /// \param [in] p Probability of value set to zero.
-  /// \param [in] state Memory that store random generator state.
-  /// \param [in] state_size Required size to store random generator state.
-  /// \param [in] seed Seed to initialize conditions of the generator state.
-  void set(engine_ext &engine, float p, void *state, size_t state_size,
-           unsigned long long seed);
-  /// Getting parameters from a dropout descriptor.
-  /// \param [in] engine Engine of the dropout operation.
-  /// \param [in] p Probability of value set to zero.
-  /// \param [in] state Memory that store random generator state.
-  /// \param [in] seed Seed to initialize conditions of the generator state.
-  void get(float *p, void **states, unsigned long long *seed) const noexcept {
-    *seed = _imp->_seed;
-    *states = _imp->_state;
-    *p = _imp->_p;
-  }
-  /// Getting the probability of value set to zero.
-  /// \returns Probability.
-  float get_probability() const noexcept { return _imp->_p; }
-  /// Restoreing a dropout descriptor from stored state.
-  /// \param [in] engine Engine of the dropout operation.
-  /// \param [in] p Probability of value set to zero.
-  /// \param [in] state Memory that store random generator state.
-  /// \param [in] state_size Required size to store random generator state.
-  /// \param [in] seed Seed to initialize conditions of the generator state.
-  void restore(engine_ext &engine, float p, void *state, size_t state_size,
-               unsigned long long seed);
-  friend class engine_ext;
-};
-
-namespace detail {
-typedef std::string primitive_cache_key_type;
-typedef std::list<primitive_cache_key_type> usage_list_type;
-struct primitive_cache_value_type {
-  ::dnnl::primitive *_primitive;
-  std::unordered_map<int, ::dnnl::memory> *_args;
-  usage_list_type::iterator _usage_it;
-  std::function<void(::dnnl::primitive *)> _destructor;
-  sycl::event _e;
-  sycl::queue _q;
-  primitive_cache_value_type(
-      ::dnnl::primitive *primitive,
-      std::unordered_map<int, ::dnnl::memory> *args,
-      usage_list_type::iterator usage_it,
-      std::function<void(::dnnl::primitive *)> destructor, sycl::event e,
-      sycl::queue q)
-      : _primitive(primitive), _args(args), _usage_it(usage_it),
-        _destructor(destructor), _e(e), _q(q) {}
-};
-struct primitive_and_args {
-  ::dnnl::primitive *primitive;
-  std::unordered_map<int, ::dnnl::memory> *args;
-};
-typedef std::unordered_map<primitive_cache_key_type,
-                           std::shared_ptr<primitive_cache_value_type>>
-    cache_map_type;
-
-// The primitive cache uses LRU replacement policy, and the default cache
-// capacity is 1024.
-class primitive_cache {
-  int _capacity = 1024;
-  usage_list_type usage;
-  cache_map_type cache_map;
-  void touch(cache_map_type::iterator it, sycl::event e = {},
-             bool update_event = false) {
-    if (it->second->_usage_it != usage.begin()) {
-      const primitive_cache_key_type &key = it->first;
-      usage.erase(it->second->_usage_it);
-      usage.push_front(key);
-      it->second->_usage_it = usage.begin();
-    }
-    if (update_event) {
-      it->second->_e = e;
-    }
-  }
-
-public:
-  std::shared_ptr<primitive_cache_value_type>
-  get(const primitive_cache_key_type &key) {
-    auto it = cache_map.find(key);
-    if (it == cache_map.end()) {
-      return nullptr;
-    }
-    touch(it);
-    return it->second;
-  }
-  void put(const primitive_cache_key_type &key, ::dnnl::primitive *value,
-           std::unordered_map<int, ::dnnl::memory> *args,
-           std::function<void(::dnnl::primitive *)> destructor, sycl::event e,
-           sycl::queue *q) {
-    auto it = cache_map.find(key);
-    if (it != cache_map.end()) {
-      touch(it, e, true);
-    } else {
-      if (cache_map.size() == _capacity) {
-        auto v = *(cache_map.find(usage.back())->second);
-        v._q.submit([=](sycl::handler &cgh) {
-          cgh.depends_on(v._e);
-          cgh.host_task([=] {
-            delete v._args;
-            v._destructor(v._primitive);
-          });
-        });
-        cache_map.erase(usage.back());
-        usage.pop_back();
-      }
-      usage.push_front(key);
-      cache_map[key] = std::make_shared<primitive_cache_value_type>(
-          value, args, usage.begin(), destructor, e, *q);
-    }
-  }
-};
-} // namespace detail
-
-/// A class holding the oneDNN engine.
-class engine_ext {
-  struct output_argument_info {
-    float _alpha;
-    float _beta;
-    int _name;
-    memory_desc_ext _desc;
-    void *_data;
-    output_argument_info(float alpha, float beta, int name,
-                         memory_desc_ext desc, void *data)
-        : _alpha(alpha), _beta(beta), _name(name), _desc(desc), _data(data) {}
-    output_argument_info(float alpha, float beta, memory_desc_ext desc,
-                         void *data)
-        : _alpha(alpha), _beta(beta), _name(0), _desc(desc), _data(data) {}
-  };
-  struct buffer_info {
-    size_t capacity = 0;
-    uint8_t *buffer = nullptr;
-    size_t usage = 0;
-    sycl::queue q;
-    sycl::event deps;
-    size_t primitive_depth = 0;
-  };
-  struct internal_resource {
-    std::int64_t random_engine_state_size = -1;
-    buffer_info binfo;
-  };
-  std::shared_ptr<::dnnl::engine> _eng = nullptr;
-  std::shared_ptr<::dnnl::stream> _s = nullptr;
-  sycl::queue *_q = nullptr;
-  unsigned int _engine_id = 0;
-  static thread_local unsigned int _engine_count;
-  static thread_local std::map<void *, ::dnnl::memory> _workspace_map;
-  static thread_local std::map<sycl::queue *,
-                               std::shared_ptr<internal_resource>>
-      _internal_resource_cache;
-  static thread_local detail::primitive_cache _primitive_cache;
-  ::dnnl::memory &get_workspace(void *key) { return _workspace_map[key]; }
-  void insert_workspace(void *key, ::dnnl::memory workspace) {
-    _workspace_map[key] = workspace;
-  }
-  const ::dnnl::stream &get_stream() const { return *_s; }
-  const ::dnnl::engine &get_engine() const { return *_eng; }
-
-  void *allocate(const memory_desc_ext &desc, int count = 1);
-  void *allocate(size_t size);
-  std::shared_ptr<internal_resource> get_internal_resource(sycl::queue *q){
-    auto it = _internal_resource_cache.find(_q);
-    if (it == _internal_resource_cache.end()) {
-      return _internal_resource_cache[_q] = std::make_shared<internal_resource>();
-    }
-    return it->second;
-  }
-  void enter_primitive(size_t request_buffer_size = 0) {
-    auto &info = get_internal_resource(_q)->binfo;
-    if (info.primitive_depth == 0) {
-      info.usage = 0;
-      if (request_buffer_size > info.capacity) {
-        if (info.buffer && (info.capacity != 0)) {
-          auto ainfo = info;
-          ainfo.q.submit([=](sycl::handler &cgh) {
-            cgh.depends_on(ainfo.deps);
-            cgh.host_task([=] { sycl::free(ainfo.buffer, ainfo.q); });
-          });
-        }
-        size_t new_buffer_capacity =
-            std::max(request_buffer_size, info.capacity * 2);
-        info.capacity = new_buffer_capacity;
-        info.buffer = (uint8_t *)sycl::malloc_device(new_buffer_capacity, *_q);
-        info.q = *_q;
-        info.deps = sycl::event();
-      }
-    }
-    info.primitive_depth++;
-  }
-  sycl::event exit_primitive(const sycl::event &e) {
-    auto &info = get_internal_resource(_q)->binfo;
-    info.primitive_depth--;
-    if ((info.primitive_depth == 0) && info.usage) {
-      info.deps = e;
-    }
-    return e;
-  }
-  ::dnnl::memory::desc
-  compress_spatial_dimensions_to_channel(const ::dnnl::memory::desc &desc);
-  ::dnnl::memory::desc
-  get_bn_scale_bias_mean_var_desc(const ::dnnl::memory::desc &desc,
-                                  batch_normalization_mode mode);
-  sycl::event batch_normalization_backward_internal(
-      batch_normalization_mode mode, float epsilon, float alpha_data,
-      const memory_desc_ext &src_desc, void *src,
-      const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data,
-      const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param,
-      const memory_desc_ext &diff_scale_bias_desc, void *scale, void *bias,
-      float beta_param, void *diff_scale, void *diff_bias,
-      const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var);
-  sycl::event batch_normalization_forward_internal(
-      bool is_infer, batch_normalization_mode mode, float epsilon, float factor,
-      float alpha, const memory_desc_ext &src_desc, void *src, float beta,
-      const memory_desc_ext &dst_desc, void *dst,
-      const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
-      const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var,
-      void *running_mean, void *running_var);
-  ::dnnl::memory::desc
-  transfer_memory_desc_to_channel_major_format(const ::dnnl::memory::desc &desc);
-  ::dnnl::memory::desc
-  bn_reorder_memory_to_channel_major_format(
-      bool is_input, ::dnnl::memory::desc &desc, void *src, void **cache);
-  ::dnnl::memory::desc
-  transfer_memory_desc_to_format_tag_any(const ::dnnl::memory::desc &desc){
-    return ::dnnl::memory::desc(desc.get_dims(), desc.get_data_type(),
-                                ::dnnl::memory::format_tag::any);
-  }
-  void allocate_and_reorder_memory_to_optimal(::dnnl::memory::desc &from_desc,
-                                              void *&from,
-                                              ::dnnl::memory::desc &to_desc,
-                                              void *&to) {
-    if (from_desc != to_desc) {
-      to = allocate(to_desc);
-      async_reorder(1.f, from_desc, from, 0.f, to_desc, to);
-    }
-  }
-  template <typename primitive_type, typename... args_type>
-  std::pair<detail::primitive_cache_key_type, detail::primitive_and_args>
-  create_primitive_args_or_get(args_type &&...args);
-  template <typename primitive_type>
-  typename primitive_type::primitive_desc
-  get_primitive_desc(::dnnl::primitive *p);
-  template <typename primitive_type, typename... args_type>
-  typename primitive_type::primitive_desc
-  create_primitive_desc(args_type &&...args);
-  template <typename T>
-  void generate_cache_key(std::string &key_buffer, const T &arg);
-  template <typename T, typename... args_type>
-  void generate_cache_key(std::string &key_buffer, const T &first_arg,
-                          const args_type &...args);
-  void insert_arg(std::unordered_map<int, ::dnnl::memory> *args, int name,
-                  const ::dnnl::memory::desc &desc, void *data) {
-    auto it = args->find(name);
-    if (it != args->end()) {
-      it->second.set_data_handle(data);
-    } else {
-      args->insert({name, ::dnnl::memory(desc, *_eng, data)});
-    }
-  }
-  void insert_arg(std::unordered_map<int, ::dnnl::memory> *args, int name,
-                  const ::dnnl::memory &mem) {
-    (*args)[name] = mem;
-  }
-  sycl::event execute_rnn_forward_primitive(
-      rnn_mode mode, ::dnnl::prop_kind kind, ::dnnl::rnn_direction direction,
-      rnn_bias_mode bias_mode, ::dnnl::memory::data_type dt,
-      ::dnnl::memory::format_tag tag, int seq_length, int batch_size, int src_c,
-      int dst_c, int layer_size, int direction_num, int hidden_size,
-      int gate_num, int projection_size, std::vector<void *> &data,
-      std::vector<int> &offset, int iter_num, size_t *weight_size = nullptr,
-      size_t *workspace_size = nullptr, size_t *scratchpad_size = nullptr);
-
-  sycl::event rnn_forward_internal(
-      const rnn_desc &desc, ::dnnl::prop_kind kind,
-      const memory_desc_ext &src_desc, void *src,
-      const memory_desc_ext &dst_desc, void *dst,
-      const memory_desc_ext &iter_desc, void *src_iter, void *dst_iter,
-      const memory_desc_ext &iter_c_desc, void *src_iter_c, void *dst_iter_c,
-      size_t weight_size, void *weight, size_t workspace_size, void *workspace,
-      size_t scratchpad_size, void *scratchpad, bool is_get_execution_args,
-      size_t *weight_size_query, size_t *workspace_size_query,
-      size_t *scratchpad_size_query);
-
-  sycl::event execute_rnn_backward_primitive(
-      rnn_mode mode, ::dnnl::rnn_direction direction, rnn_bias_mode bias_mode,
-      ::dnnl::memory::data_type dt, ::dnnl::memory::format_tag tag,
-      int seq_length, int batch_size, int src_c, int dst_c, int layer_size,
-      int direction_num, int hidden_size, int gate_num, int projection_size,
-      std::vector<void *> &data, std::vector<int> &offset, int iter_num);
-  bool
-  scale_parameter_preprocess(const std::vector<output_argument_info> &args);
-  template <typename primitive_type>
-  sycl::event
-  execute_primitive(const std::pair<detail::primitive_cache_key_type,
-                                    detail::primitive_and_args> &primitive,
-                    const std::vector<output_argument_info> &extra_args = {});
-  template <typename T>
-  sycl::event fill_with_type(sycl::queue *q, void *src, const void *value,
-                             size_t size_with_byte) {
-    return q->fill<T>(static_cast<T *>(src), *static_cast<const T *>(value),
-                      size_with_byte / sizeof(T));
-  }
-  template <typename T> struct no_zero_op {
-    T operator()(T e) {
-      if (!e) {
-        return 1;
-      }
-      return e;
-    }
-  };
-  template <typename T>
-  void transform_no_zero_with_type(sycl::queue *q, void *src, void *dst,
-                                   size_t num) {
-    std::transform(oneapi::dpl::execution::make_device_policy(*q),
-                   static_cast<T *>(src), static_cast<T *>(src) + num,
-                   static_cast<T *>(dst), no_zero_op<T>());
-  }
-  void transform_no_zero(const memory_desc_ext &desc, void *src, void *dst);
-  ::dnnl::memory::desc get_group_weight_desc(int group_count,
-                                             const memory_desc_ext &weight_desc);
-  void get_rnn_configuration(const ::dnnl::memory::desc &desc,
-                             rnn_direction direction, rnn_mode mode,
-                             dpct::library_data_t dt, int hidden_size,
-                             ::dnnl::memory::data_type *dnnl_dt,
-                             ::dnnl::memory::format_tag *tag,
-                             int *projection_size, int *output_size,
-                             int *seq_length, int *batch_size,
-                             int *direction_num, int *gate_num);
-public:
-  engine_ext() {}
-  operator bool() const {
-    return bool(_eng) && bool(_s) && bool(_q);
-  }
-  engine_ext &operator=(std::nullptr_t) {
-    _eng = nullptr;
-    _s = nullptr;
-    _q = nullptr;
-    return *this;
-  }
-  /// Creating oneDNN engine.
-  void create_engine() {
-    _q = &dpct::get_current_device().default_queue();
-    _eng = std::make_shared<::dnnl::engine>(::dnnl::sycl_interop::make_engine(
-        dpct::get_current_device(), dpct::get_current_device().get_context()));
-    _s = std::make_shared<::dnnl::stream>(
-        ::dnnl::sycl_interop::make_stream(*_eng, *_q));
-    _engine_id = _engine_count++;
-  }
-  /// Setting the user's SYCL queue for an oneDNN engine.
-  /// \param [in] q Pointer to the SYCL queue.
-  void set_queue(sycl::queue *q) {
-    if (!q) {
-      throw std::runtime_error("set_queue: pointer must not be nullptr.");
-    }
-    if (!_eng) {
-      throw std::runtime_error("set_queue: current engine is invalid.");
-    }
-    if (q->get_context() != ::dnnl::sycl_interop::get_context(*_eng)) {
-      throw std::runtime_error(
-          "set_queue: queue is mismatch with current engine context.");
-    }
-    _q = q;
-    _s = std::make_shared<::dnnl::stream>(
-        ::dnnl::sycl_interop::make_stream(*_eng, *_q));
-  }
-  /// Retrieving the user's SYCL queue set in the oneDNN engine.
-  /// \returns Pointer to the SYCL queue.
-  sycl::queue *get_queue() const { return _q; }
-  /// Setting all elements of a memory to a given value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] valuePtr Pointer to a single value.
-  void fill(const memory_desc_ext &src_desc, void *src,
-                   const void *valuePtr);
-  /// Coping the scaled data from a memory to another memory with a different
-  /// description.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  void reorder(float alpha, const memory_desc_ext &src_desc, void *src,
-                      float beta, const memory_desc_ext &dst_desc, void *dst);
-  /// Scaling all the elements of a memory by a given factor.
-  /// \param [in] alpha Value to scaling factors.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [out] src Pointer to source data.
-  void scale(float alpha, const memory_desc_ext &src_desc, void *src);
-  /// Adding the scaled values of a memory to another memory.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  void sum(float alpha, const memory_desc_ext &src_desc, void *src,
-                  float beta, const memory_desc_ext &dst_desc, void *dst);
-  /// Computing a specified activation function value.
-  /// \param [in] desc Activation descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  void activation_forward(activation_desc &desc, float alpha,
-                                 const memory_desc_ext &src_desc, void *src,
-                                 float beta, const memory_desc_ext &dst_desc,
-                                 void *dst);
-  /// Computing the gradient of a specified activation function.
-  /// \param [in] desc Activation descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [in] dst Pointer to destination data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the differential destination memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential source data.
-  void
-  activation_backward(activation_desc &desc, float alpha,
-                      const memory_desc_ext &dst_desc, void *dst,
-                      const memory_desc_ext &diff_dst_desc, void *diff_dst,
-                      const memory_desc_ext &src_desc, void *src, float beta,
-                      const memory_desc_ext &diff_src_desc, void *diff_src);
-  /// Computing a specified pooling function value.
-  /// \param [in] desc Pooling descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \param [out] workspace Pointer to workspace generated from forward propagation.
-  void pooling_forward(pooling_desc &desc, float alpha,
-                              const memory_desc_ext &src_desc, void *src,
-                              float beta, const memory_desc_ext &dst_desc,
-                              void *dst, ::dnnl::memory *workspace = nullptr);
-  /// Computing the gradient of a specified pooling function.
-  /// \param [in] desc Activation descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [in] dst Pointer to destination data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the differential destination memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential
-  /// source data. 
-  /// \param [in] workspace Pointer to workspace used for backward
-  /// propagation.
-  void pooling_backward(pooling_desc &desc, float alpha,
-                               const memory_desc_ext &dst_desc, void *dst,
-                               const memory_desc_ext &diff_dst_desc,
-                               void *diff_dst, const memory_desc_ext &src_desc,
-                               void *src, float beta,
-                               const memory_desc_ext &diff_src_desc,
-                               void *diff_src,
-                               ::dnnl::memory *workspace = nullptr);
-  /// Computing a specified softmax function value.
-  /// \param [in] alg Softmax algorithm.
-  /// \param [in] mode Softmax mode.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value. 
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data. 
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  void softmax_forward(softmax_algorithm alg, softmax_mode mode,
-                              float alpha, const memory_desc_ext &src_desc,
-                              void *src, float beta,
-                              const memory_desc_ext &dst_desc, void *dst);
-  /// Computing the gradient of a specified softmax function.
-  /// \param [in] alg Softmax algorithm.
-  /// \param [in] mode Softmax mode.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value. 
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [in] dst Pointer to destination data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the differential destination memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential source data.
-  void softmax_backward(softmax_algorithm alg, softmax_mode mode,
-                               float alpha, const memory_desc_ext &dst_desc,
-                               void *dst, const memory_desc_ext &diff_dst_desc,
-                               void *diff_dst, float beta,
-                               const memory_desc_ext &diff_src_desc,
-                               void *diff_src);
-  /// Computing a specified local response normalization function value.
-  /// \param [in] desc Local response normalization descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \param [out] workspace Pointer to workspace generated from forward
-  /// propagation.
-  void lrn_forward(lrn_desc &desc, float alpha,
-                          const memory_desc_ext &src_desc, void *src,
-                          float beta, const memory_desc_ext &dst_desc,
-                          void *dst, ::dnnl::memory *workspace = nullptr);
-  /// Computing the gradient of a specified local response normalization
-  /// function.
-  /// \param [in] desc Local response normalization descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed value.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [in] dst Pointer to destination data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the differential destination memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential source data.
-  /// \param [in] workspace Pointer to workspace used for backward propagation.
-  void lrn_backward(lrn_desc &desc, float alpha,
-                           const memory_desc_ext &dst_desc, void *dst,
-                           const memory_desc_ext &diff_dst_desc, void *diff_dst,
-                           const memory_desc_ext &src_desc, void *src,
-                           float beta, const memory_desc_ext &diff_src_desc,
-                           void *diff_src, ::dnnl::memory *workspace = nullptr);
-  /// Setting all elements of a memory to a given value asynchronously.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] valuePtr Pointer to a single value.
-  /// \returns An event representing the fill operations.
-  sycl::event async_fill(const memory_desc_ext &src_desc, void *src,
-                   const void *valuePtr);
-  /// Coping the scaled data from a memory to another memory with a different
-  /// description asynchronously.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \returns An event representing the reorder operations.
-  sycl::event async_reorder(float alpha, const memory_desc_ext &src_desc, void *src,
-                      float beta, const memory_desc_ext &dst_desc, void *dst);
-  /// Scaling all the elements of a memory by a given factor asynchronously.
-  /// \param [in] alpha Value to scaling factors.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [out] src Pointer to source data.
-  /// \returns An event representing the scale operations.
-  sycl::event async_scale(float alpha, const memory_desc_ext &src_desc, void *src);
-  /// Adding the scaled values of a memory to another memory asynchronously.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \returns An event representing the sum operations.
-  sycl::event async_sum(float alpha, const memory_desc_ext &src_desc, void *src,
-                  float beta, const memory_desc_ext &dst_desc, void *dst);
-
-  /// Perform specified binary operation asynchronously.
-  /// \param [in] op Specified binary operation.
-  /// \param [in] alpha_0 Value to scaling factors used to scale the src_0
-  /// value.
-  /// \param [in] src_desc_0 Source 0 memory descriptor.
-  /// \param [in] src_0 Pointer to source 0 data.
-  /// \param [in] alpha_1 Value to scaling factors used to scale the src_1
-  /// value.
-  /// \param [in] src_desc_1 Source 1 memory descriptor.
-  /// \param [in] src_1 Pointer to source 1 data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \returns An event representing the binary operations.
-  sycl::event async_binary(binary_op op, float alpha_0,
-                     const memory_desc_ext &src_desc_0, void *src_0,
-                     float alpha_1, const memory_desc_ext &src_desc_1,
-                     void *src_1, float beta, const memory_desc_ext &dst_desc,
-                     void *dst);
-
-  /// Perform specified binary operation asynchronously.
-  /// \param [in] op Specified reduction operation.
-  /// \param [in] alpha Value to scaling factors used to scale the data
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \returns An event representing the reduction operations.
-  sycl::event async_reduction(reduction_op op, float alpha,
-                        const memory_desc_ext &src_desc, void *src, float beta,
-                        const memory_desc_ext &dst_desc, void *dst);
-  /// Computing a specified activation function value asynchronously.
-  /// \param [in] desc Activation descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \returns An event representing the activation forward operations.
-  sycl::event async_activation_forward(activation_desc &desc, float alpha,
-                                 const memory_desc_ext &src_desc, void *src,
-                                 float beta, const memory_desc_ext &dst_desc,
-                                 void *dst);
-  /// Computing the gradient of a specified activation function asynchronously.
-  /// \param [in] desc Activation descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [in] dst Pointer to destination data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the differential destination memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential source data.
-  /// \returns An event representing the activation backward operations.
-  sycl::event
-  async_activation_backward(activation_desc &desc, float alpha,
-                      const memory_desc_ext &dst_desc, void *dst,
-                      const memory_desc_ext &diff_dst_desc, void *diff_dst,
-                      const memory_desc_ext &src_desc, void *src, float beta,
-                      const memory_desc_ext &diff_src_desc, void *diff_src);
-  /// Computing a specified pooling function value asynchronously.
-  /// \param [in] desc Pooling descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \param [out] workspace Pointer to workspace generated from forward propagation.
-  /// \returns An event representing the pooling forward operations.
-  sycl::event async_pooling_forward(pooling_desc &desc, float alpha,
-                              const memory_desc_ext &src_desc, void *src,
-                              float beta, const memory_desc_ext &dst_desc,
-                              void *dst, ::dnnl::memory *workspace = nullptr);
-  /// Computing the gradient of a specified pooling function asynchronously.
-  /// \param [in] desc Activation descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [in] dst Pointer to destination data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the differential destination memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential
-  /// source data. 
-  /// \param [in] workspace Pointer to workspace used for backward
-  /// propagation.
-  /// \returns An event representing the pooling backward operations.
-  sycl::event async_pooling_backward(pooling_desc &desc, float alpha,
-                               const memory_desc_ext &dst_desc, void *dst,
-                               const memory_desc_ext &diff_dst_desc,
-                               void *diff_dst, const memory_desc_ext &src_desc,
-                               void *src, float beta,
-                               const memory_desc_ext &diff_src_desc,
-                               void *diff_src,
-                               ::dnnl::memory *workspace = nullptr);
-  /// Computing a specified softmax function value asynchronously.
-  /// \param [in] alg Softmax algorithm.
-  /// \param [in] mode Softmax mode.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value. 
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data. 
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \returns An event representing the softmax forward operations.
-  sycl::event async_softmax_forward(softmax_algorithm alg, softmax_mode mode,
-                              float alpha, const memory_desc_ext &src_desc,
-                              void *src, float beta,
-                              const memory_desc_ext &dst_desc, void *dst);
-  /// Computing the gradient of a specified softmax function asynchronously.
-  /// \param [in] alg Softmax algorithm.
-  /// \param [in] mode Softmax mode.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value. 
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [in] dst Pointer to destination data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the differential destination memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential source data.
-  /// \returns An event representing the softmax backward operations.
-  sycl::event async_softmax_backward(softmax_algorithm alg, softmax_mode mode,
-                               float alpha, const memory_desc_ext &dst_desc,
-                               void *dst, const memory_desc_ext &diff_dst_desc,
-                               void *diff_dst, float beta,
-                               const memory_desc_ext &diff_src_desc,
-                               void *diff_src);
-  /// Computing a specified local response normalization function value
-  /// asynchronously.
-  /// \param [in] desc Local response normalization descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \param [out] workspace Pointer to workspace generated from forward
-  /// propagation.
-  /// \returns An event representing the lrn forward operations.
-  sycl::event async_lrn_forward(lrn_desc &desc, float alpha,
-                          const memory_desc_ext &src_desc, void *src,
-                          float beta, const memory_desc_ext &dst_desc,
-                          void *dst, ::dnnl::memory *workspace = nullptr);
-  /// Computing the gradient of a specified local response normalization
-  /// function asynchronously.
-  /// \param [in] desc Local response normalization descriptor.
-  /// \param [in] alpha Value to scaling factors used to scale the computed value.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [in] dst Pointer to destination data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the differential destination memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential source data.
-  /// \param [in] workspace Pointer to workspace used for backward propagation.
-  /// \returns An event representing the lrn backward operations.
-  sycl::event async_lrn_backward(lrn_desc &desc, float alpha,
-                           const memory_desc_ext &dst_desc, void *dst,
-                           const memory_desc_ext &diff_dst_desc, void *diff_dst,
-                           const memory_desc_ext &src_desc, void *src,
-                           float beta, const memory_desc_ext &diff_src_desc,
-                           void *diff_src, ::dnnl::memory *workspace = nullptr);
-
-  /// Derives a memory descriptor for the batch normalization scale, bias, mean,
-  /// variance from the source memory descriptor and batch normalization mode.
-  /// \param [out] desc Derived memory descriptor.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] mode Batch normalization mode.
-  static void derive_batch_normalization_memory_desc(memory_desc_ext &desc,
-                                              const memory_desc_ext &src_desc,
-                                              batch_normalization_mode mode);
-
-  /// Derives a memory descriptor for the batch normalization scale, bias, mean,
-  /// variance from the source memory descriptor and batch normalization mode.
-  /// \param [out] scale_bias_desc Derived scale and bias memory descriptor.
-  /// \param [out] mean_var_desc Derived mean and var memory descriptor.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] mode Batch normalization mode.
-  static void derive_batch_normalization_memory_desc(memory_desc_ext &scale_bias_desc,
-                                             memory_desc_ext &mean_var_desc,
-                                             const memory_desc_ext &src_desc,
-                                             batch_normalization_mode mode);
-
-  /// Get the size of workspace that needed by batch normalization. The data stored
-  /// in workspace must be preserved between forward and backward.
-  /// \param [in] ops Batch normalization operation mode. This mode can set to
-  /// perform only batch normalization, or batch normalization followed by
-  /// activation, or batch normalization followed by element-wise addition and
-  /// activation.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \returns Size of workspace.
-  size_t get_batch_normalization_workspace_size(
-    batch_normalization_ops ops, const memory_desc_ext &src_desc);
-
-  /// Computing a specified batch normalization inference stage function value
-  /// asynchronously.
-  /// \param [in] mode Batch normalization mode.
-  /// \param [in] epsilon Epsilon value used in computation.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \param [in] scale_bias_mean_var_desc Scale, bias, mean, variance memory
-  /// descriptor.
-  /// \param [in] scale Pointer to scale data.
-  /// \param [in] bias Pointer to bias data.
-  /// \param [in] mean Pointer to mean data.
-  /// \param [in] var Pointer to variance data.
-  /// \returns An event representing the batch normalization forward operations.
-  sycl::event async_batch_normalization_forward_inference(
-      batch_normalization_mode mode, float epsilon, float alpha,
-      const memory_desc_ext &src_desc, void *src, float beta,
-      const memory_desc_ext &dst_desc, void *dst,
-      const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
-      void *mean, void *var);
-
-  /// Computing a specified batch normalization inference stage function value
-  /// asynchronously.
-  /// \param [in] mode Batch normalization mode.
-  /// \param [in] ops Batch normalization operation mode. This mode can set to
-  /// perform only batch normalization, or batch normalization followed by
-  /// activation, or batch normalization followed by element-wise addition and
-  /// activation.
-  /// \param [in] adesc Activation operation descriptor.
-  /// \param [in] epsilon Epsilon value used in computation.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \param [in] summand_desc Summand memory descriptor.
-  /// \param [in] summand Pointer to summand data.
-  /// \param [in] scale_bias_desc Scale, bias memory descriptor.
-  /// \param [in] scale Pointer to scale data.
-  /// \param [in] bias Pointer to bias data.
-  /// \param [in] mean_var_desc Mean, variance memory descriptor.
-  /// \param [in] mean Pointer to mean data.
-  /// \param [in] var Pointer to variance data.
-  /// \returns An event representing the batch normalization forward operations.
-  sycl::event async_batch_normalization_forward_inference(
-      batch_normalization_mode mode, batch_normalization_ops ops,
-      activation_desc &adesc, float epsilon, float alpha,
-      const memory_desc_ext &src_desc, void *src, float beta,
-      const memory_desc_ext &dst_desc, void *dst,
-      const memory_desc_ext &summand_desc, void *summand,
-      const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
-      const memory_desc_ext &mean_var_desc, void *mean, void *var);
-
-  /// Computing a specified batch normalization training stage function value
-  /// asynchronously.
-  /// \param [in] mode Batch normalization mode.
-  /// \param [in] epsilon Epsilon value used in computation.
-  /// \param [in] factor Factor value used in running mean and variance
-  /// computation.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \param [in] scale_bias_mean_var_desc Scale, bias, mean, variance memory
-  /// descriptor.
-  /// \param [in] scale Pointer to scale data.
-  /// \param [in] bias Pointer to bias data.
-  /// \param [out] running_mean Pointer to running mean data.
-  /// \param [out] running_var Pointer to running variance data.
-  /// \param [out] saved_mean Pointer to optional cache to save mean data.
-  /// \param [out] saved_var Pointer to optional cache to save variance data.
-  /// \returns An event representing the batch normalization forward operations.
-  sycl::event async_batch_normalization_forward_training(
-      batch_normalization_mode mode, float epsilon, float factor, float alpha,
-      const memory_desc_ext &src_desc, void *src, float beta,
-      const memory_desc_ext &dst_desc, void *dst,
-      const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
-      void *running_mean, void *running_var, void *saved_mean, void *saved_var);
-
-  /// Computing a specified batch normalization training stage function value
-  /// asynchronously.
-  /// \param [in] mode Batch normalization mode.
-  /// \param [in] ops Batch normalization operation mode. This mode can set to
-  /// perform only batch normalization, or batch normalization followed by
-  /// activation, or batch normalization followed by element-wise addition and
-  /// activation.
-  /// \param [in] adesc Activation operation descriptor.
-  /// \param [in] epsilon Epsilon value used in computation.
-  /// \param [in] factor Factor value used in running mean and variance
-  /// computation.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \param [in] summand_desc Summand memory descriptor.
-  /// \param [in] summand Pointer to summand data.
-  /// \param [in] scale_bias_mean_var_desc Scale, bias, mean, variance memory
-  /// descriptor.
-  /// \param [in] scale Pointer to scale data.
-  /// \param [in] bias Pointer to bias data.
-  /// \param [out] running_mean Pointer to running mean data.
-  /// \param [out] running_var Pointer to running variance data.
-  /// \param [out] saved_mean Pointer to optional cache to save mean data.
-  /// \param [out] saved_var Pointer to optional cache to save variance data.
-  /// \param [in] workspace_size Size of workspace.
-  /// \param [out] workspace Pointer to workspace generated from forward
-  /// propagation.
-  /// \returns An event representing the batch normalization forward operations.
-  sycl::event async_batch_normalization_forward_training(
-      batch_normalization_mode mode, batch_normalization_ops ops,
-      activation_desc &adesc, float epsilon, float factor, float alpha,
-      const memory_desc_ext &src_desc, void *src, float beta,
-      const memory_desc_ext &dst_desc, void *dst,
-      const memory_desc_ext &summand_desc, void *summand,
-      const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
-      void *running_mean, void *running_var, void *saved_mean, void *saved_var,
-      size_t workspace_size, void *workspace);
-
-  /// Computing a specified batch normalization training stage function value
-  /// asynchronously.
-  /// \param [in] mode Batch normalization mode.
-  /// \param [in] ops Batch normalization operation mode. This mode can set to
-  /// perform only batch normalization, or batch normalization followed by
-  /// activation, or batch normalization followed by element-wise addition and
-  /// activation.
-  /// \param [in] adesc Activation operation descriptor.
-  /// \param [in] epsilon Epsilon value used in computation.
-  /// \param [in] factor Factor value used in running mean and variance
-  /// computation.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \param [in] summand_desc Summand memory descriptor.
-  /// \param [in] summand Pointer to summand data.
-  /// \param [in] scale_bias_desc Scale, bias memory descriptor.
-  /// \param [in] scale Pointer to scale data.
-  /// \param [in] bias Pointer to bias data.
-  /// \param [in] mean_var_desc Mean, variance memory descriptor.
-  /// \param [out] running_mean Pointer to running mean data.
-  /// \param [out] running_var Pointer to running variance data.
-  /// \param [out] saved_mean Pointer to optional cache to save mean data.
-  /// \param [out] saved_var Pointer to optional cache to save variance data.
-  /// \param [in] workspace_size Size of workspace.
-  /// \param [out] workspace Pointer to workspace generated from forward
-  /// propagation.
-  /// \returns An event representing the batch normalization forward operations.
-  sycl::event async_batch_normalization_forward_training(
-      batch_normalization_mode mode, batch_normalization_ops ops,
-      activation_desc &adesc, float epsilon, float factor, float alpha,
-      const memory_desc_ext &src_desc, void *src, float beta,
-      const memory_desc_ext &dst_desc, void *dst,
-      const memory_desc_ext &summand_desc, void *summand,
-      const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
-      const memory_desc_ext &mean_var_desc, void *running_mean, void *running_var,
-      void *saved_mean, void *saved_var, size_t workspace_size, void *workspace);
-
-  /// Computing the gradient of a specified batch normalization function asynchronously.
-  /// \param [in] mode Batch normalization mode.
-  /// \param [in] epsilon Epsilon value used in computation.
-  /// \param [in] alpha_data Value to scaling factors used to scale the computed
-  /// data value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] beta_data Value to scaling factors used to scale the prior value
-  /// in the data memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential source data.
-  /// \param [in] alpha_param Value to scaling factors used to scale the computed
-  /// parameter value.
-  /// \param [in] diff_scale_bias_mean_var_desc Differential scale, bias, mean,
-  /// variance memory descriptor.
-  /// \param [in] scale Pointer to scale data.
-  /// \param [in] beta_param Value to scaling factors used to scale the prior value
-  /// in the parameter memory.
-  /// \param [in] diff_scale Pointer to differential scale data.
-  /// \param [in] diff_bias Pointer to differential bias data.
-  /// \param [in] saved_mean Pointer to optional cache saved mean data in forward.
-  /// \param [in] saved_var Pointer to optional cache saved variance data in forward.
-  /// \returns An event representing the batch normalization backward operations.
-  sycl::event async_batch_normalization_backward(
-      batch_normalization_mode mode, float epsilon, float alpha_data,
-      const memory_desc_ext &src_desc, void *src,
-      const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data,
-      const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param,
-      const memory_desc_ext &diff_scale_bias_mean_var_desc, void *scale,
-      float beta_param, void *diff_scale, void *diff_bias, void *saved_mean,
-      void *saved_var);
-
-  /// Computing the gradient of a specified batch normalization function
-  /// asynchronously.
-  /// \param [in] mode Batch normalization mode.
-  /// \param [in] ops Batch normalization operation mode. This mode can set to
-  /// perform only batch normalization, or batch normalization followed by
-  /// activation, or batch normalization followed by element-wise addition and
-  /// activation.
-  /// \param [in] adesc Activation operation descriptor.
-  /// \param [in] epsilon Epsilon value used in computation.
-  /// \param [in] alpha_data Value to scaling factors used to scale the computed
-  /// data value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [in] dst Pointer to destination data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] beta_data Value to scaling factors used to scale the prior value
-  /// in the data memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential source data.
-  /// \param [in] diff_summand_desc Differential summand memory descriptor.
-  /// \param [out] diff_summand Pointer to differential summand data.
-  /// \param [in] alpha_param Value to scaling factors used to scale the computed
-  /// parameter value.
-  /// \param [in] diff_scale_bias_mean_var_desc Differential scale, bias, mean,
-  /// variance memory descriptor.
-  /// \param [in] scale Pointer to scale data.
-  /// \param [in] bias Pointer to bias data.
-  /// \param [in] beta_param Value to scaling factors used to scale the prior value
-  /// in the parameter memory.
-  /// \param [out] diff_scale Pointer to differential scale data.
-  /// \param [out] diff_bias Pointer to differential bias data.
-  /// \param [in] saved_mean Pointer to optional cache saved mean data in forward.
-  /// \param [in] saved_var Pointer to optional cache saved variance data in forward.
-  /// \param [in] workspace_size Size of workspace.
-  /// \param [in] workspace Pointer to workspace used for backward propagation.
-  /// \returns An event representing the batch normalization backward operations.
-  sycl::event async_batch_normalization_backward(
-      batch_normalization_mode mode, batch_normalization_ops ops,
-      activation_desc &adesc, float epsilon, float alpha_data,
-      const memory_desc_ext &src_desc, void *src,
-      const memory_desc_ext &dst_desc, void *dst,
-      const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data,
-      const memory_desc_ext &diff_src_desc, void *diff_src,
-      const memory_desc_ext &diff_summand_desc, void *diff_summand,
-      float alpha_param, const memory_desc_ext &diff_scale_bias_mean_var_desc,
-      void *scale, void *bias, float beta_param, void *diff_scale,
-      void *diff_bias, void *saved_mean, void *saved_var,
-      size_t workspace_size, void *workspace);
-
-  /// Computing the gradient of a specified batch normalization function
-  /// asynchronously.
-  /// \param [in] mode Batch normalization mode.
-  /// \param [in] ops Batch normalization operation mode. This mode can set to
-  /// perform only batch normalization, or batch normalization followed by
-  /// activation, or batch normalization followed by element-wise addition and
-  /// activation.
-  /// \param [in] adesc Activation operation descriptor.
-  /// \param [in] epsilon Epsilon value used in computation.
-  /// \param [in] alpha_data Value to scaling factors used to scale the computed
-  /// data value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [in] dst Pointer to destination data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] beta_data Value to scaling factors used to scale the prior value
-  /// in the data memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential source data.
-  /// \param [in] diff_summand_desc Differential summand memory descriptor.
-  /// \param [out] diff_summand Pointer to differential summand data.
-  /// \param [in] alpha_param Value to scaling factors used to scale the computed
-  /// parameter value.
-  /// \param [in] diff_scale_bias_desc Differential scale, bias memory descriptor.
-  /// \param [in] scale Pointer to scale data.
-  /// \param [in] bias Pointer to bias data.
-  /// \param [in] beta_param Value to scaling factors used to scale the prior value
-  /// in the parameter memory.
-  /// \param [out] diff_scale Pointer to differential scale data.
-  /// \param [out] diff_bias Pointer to differential bias data.
-  /// \param [in] mean_var_desc Differential mean, variance memory descriptor.
-  /// \param [in] saved_mean Pointer to optional cache saved mean data in forward.
-  /// \param [in] saved_var Pointer to optional cache saved variance data in forward.
-  /// \param [in] workspace_size Size of workspace.
-  /// \param [in] workspace Pointer to workspace used for backward propagation.
-  /// \returns An event representing the batch normalization backward operations.
-  sycl::event async_batch_normalization_backward(
-      batch_normalization_mode mode, batch_normalization_ops ops,
-      activation_desc &adesc, float epsilon, float alpha_data,
-      const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc,
-      void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst,
-      float beta_data, const memory_desc_ext &diff_src_desc, void *diff_src,
-      const memory_desc_ext &diff_summand_desc, void *diff_summand,
-      float alpha_param, const memory_desc_ext &diff_scale_bias_desc, void *scale,
-      void *bias, float beta_param, void *diff_scale, void *diff_bias,
-      const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var,
-      size_t workspace_size, void *workspace);
-
-  /// Computing a specified convolution function value asynchronously.
-  /// \param [in] desc Convolution descriptor.
-  /// \param [in] alg Convolution algorithm.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] weight_desc Weight memory descriptor.
-  /// \param [in] weight Pointer to weight data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \returns An event representing the convolution forward operations.
-  sycl::event async_convolution_forward(convolution_desc &desc, ::dnnl::algorithm alg,
-                                  float alpha, const memory_desc_ext &src_desc,
-                                  void *src, const memory_desc_ext &weight_desc,
-                                  void *weight, float beta,
-                                  const memory_desc_ext &dst_desc, void *dst);
-
-  /// Computing a specified convolution function value asynchronously.
-  /// \param [in] desc Convolution descriptor.
-  /// \param [in] alg Convolution algorithm.
-  /// \param [in] adesc Activation operation descriptor.
-  /// \param [in] alpha_0 Value to scaling factors used to scale the data
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] weight_desc Weight memory descriptor.
-  /// \param [in] weight Pointer to weight data.
-  /// \param [in] alpha_1 Value to scaling factors used to scale the summand
-  /// value.
-  /// \param [in] summand_desc Summand memory descriptor.
-  /// \param [in] summand Pointer to summand data.
-  /// \param [in] bias_desc Bias memory descriptor.
-  /// \param [in] bias Pointer to bias data.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \returns An event representing the convolution forward operations.
-  sycl::event async_convolution_forward(
-      convolution_desc &desc, ::dnnl::algorithm alg, activation_desc &adesc,
-      float alpha_0, const memory_desc_ext &src_desc, void *src,
-      const memory_desc_ext &weight_desc, void *weight, float alpha_1,
-      const memory_desc_ext &summand_desc, void *summand,
-      const memory_desc_ext &bias_desc, void *bias,
-      const memory_desc_ext &dst_desc, void *dst);
-
-  /// Computing the data gradient of a specified convolution function asynchronously.
-  /// \param [in] desc Convolution descriptor.
-  /// \param [in] alg Convolution algorithm.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] weight_desc Weight memory descriptor.
-  /// \param [in] weight Pointer to weight data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential source data.
-  /// \returns An event representing the convolution backward data operations.
-  sycl::event async_convolution_backward_data(
-      convolution_desc &desc, ::dnnl::algorithm alg, float alpha,
-      const memory_desc_ext &weight_desc, void *weight,
-      const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta,
-      const memory_desc_ext &diff_src_desc, void *diff_src);
-
-  /// Computing the weight gradient of a specified convolution function
-  /// asynchronously.
-  /// \param [in] desc Convolution descriptor.
-  /// \param [in] alg Convolution algorithm.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] diff_weight_desc Differential weight memory descriptor.
-  /// \param [out] diff_weight Pointer to differential weight data.
-  /// \returns An event representing the convolution backward weight operations.
-  sycl::event async_convolution_backward_weight(
-      convolution_desc &desc, ::dnnl::algorithm alg, float alpha,
-      const memory_desc_ext &src_desc, void *src,
-      const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta,
-      const memory_desc_ext &diff_weight_desc, void *diff_weight);
-
-  /// Computing the bias gradient of a specified convolution function
-  /// asynchronously.
-  /// \param [in] alpha Value to scaling factors used to scale the computed
-  /// value.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] beta Value to scaling factors used to scale the prior value
-  /// in the destination memory.
-  /// \param [in] diff_bias_desc Differential bias memory descriptor.
-  /// \param [out] diff_bias Pointer to differential bias data.
-  /// \returns An event representing the convolution backward bias operations.
-  sycl::event async_convolution_backward_bias(float alpha,
-                                        const memory_desc_ext &diff_dst_desc,
-                                        void *diff_dst, float beta,
-                                        const memory_desc_ext &diff_bias_desc,
-                                        void *diff_bias);
-
-  /// Getting the required weight space size for specified rnn operation.  
-  /// \param [in] desc RNN descriptor.
-  /// \param [out] weight_space_size Size of required weight space.
-  void rnn_get_weight_space_size(const rnn_desc &desc,
-                                 size_t *weight_space_size);
-
-  /// Getting the required scratchpad size and workspace size for specified rnn operation.  
-  /// \param [in] desc RNN descriptor.
-  /// \param [in] kind Propagation kind.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [out] scratchpad_size Size of required scratchpad.
-  /// \param [out] workspace_size Size of required workspace.
-  void rnn_get_scratchpad_workspace_size(const rnn_desc &desc, ::dnnl::prop_kind kind,
-                              const memory_desc_ext &src_desc,
-                              size_t *scratchpad_size, size_t *workspace_size);
-
-  /// Computing a specified rnn function value asynchronously.
-  /// \param [in] desc RNN descriptor.
-  /// \param [in] kind Propagation kind.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \param [in] iter_desc Recurrent hidden state data memory descriptor.
-  /// \param [in] src_iter Pointer to input recurrent hidden state data.
-  /// \param [in] dst_iter Pointer to output recurrent hidden state data.
-  /// \param [in] iter_c_desc Recurrent cell state data memory descriptor.
-  /// \param [in] src_c_iter Pointer to input recurrent cell state data.
-  /// \param [in] dst_c_iter Pointer to output recurrent cell state data.
-  /// \param [in] weight_size Size of weight memory.
-  /// \param [in] weight Pointer to weight data.
-  /// \param [in] scratchpad_size Size of scratchpad memory.
-  /// \param [in] scratchpad Pointer to scratchpad data.
-  /// \param [in] workspace_size Size of workspace memory.
-  /// \param [in] workspace Pointer to workspace data.
-  /// \returns An event representing the status of rnn forward operations.
-  sycl::event async_rnn_forward(const rnn_desc &desc, ::dnnl::prop_kind kind,
-                               const memory_desc_ext &src_desc, void *src,
-                               const memory_desc_ext &dst_desc, void *dst,
-                               const memory_desc_ext &iter_desc, void *src_iter,
-                               void *dst_iter,
-                               const memory_desc_ext &iter_c_desc,
-                               void *src_iter_c, void *dst_iter_c,
-                               size_t weight_size, void *weight,
-                               size_t scratchpad_size, void *scratchpad,
-                               size_t workspace_size, void *workspace);
-
-  /// Computing the data and weight gradient of a specified rnn function
-  /// asynchronously.
-  /// \param [in] desc RNN descriptor.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [in] dst Pointer to destination data.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [out] diff_src Pointer to differential source data.
-  /// \param [in] iter_desc Recurrent hidden state data memory descriptor.
-  /// \param [in] src_iter Pointer to input recurrent hidden state data.
-  /// \param [in] diff_dst_iter Pointer to differential output recurrent hidden state data.
-  /// \param [out] diff_src_iter Pointer to differential input recurrent hidden state data.
-  /// \param [in] iter_c_desc Recurrent cell state data memory descriptor.
-  /// \param [in] src_c_iter Pointer to input recurrent cell state data.
-  /// \param [in] diff_dst_c_iter Pointer to differential output recurrent cell state data.
-  /// \param [out] diff_src_c_iter Pointer to differential input recurrent cell state data.
-  /// \param [in] weight_size Size of weight memory.
-  /// \param [in] weight Pointer to weight data.
-  /// \param [out] diff_weight Pointer to differential weight data.
-  /// \param [in] scratchpad_size Size of scratchpad memory.
-  /// \param [in] scratchpad Pointer to scratchpad data.
-  /// \param [in] workspace_size Size of workspace memory.
-  /// \param [in] workspace Pointer to workspace data.
-  /// \returns An event representing the status of rnn backward operations.
-  sycl::event async_rnn_backward(
-      const rnn_desc &desc, const memory_desc_ext &dst_desc, void *dst,
-      void *diff_dst, const memory_desc_ext &src_desc, void *src,
-      void *diff_src, const memory_desc_ext &iter_desc, void *src_iter,
-      void *diff_dst_iter, void *diff_src_iter,
-      const memory_desc_ext &iter_c_desc, void *src_iter_c,
-      void *diff_dst_iter_c, void *diff_src_iter_c, size_t weight_size,
-      void *weight, void *diff_weight, size_t scratchpad_size, void *scratchpad,
-      size_t workspace_size, void *workspace);
-
-  /// Getting the required state size for specified dropout operation.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \returns Required size of state.
-  size_t get_dropout_state_size();
-
-  /// Getting the required workspace size for dropout operation.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \returns Required size of workspace.
-  static size_t get_dropout_workspace_size(const memory_desc_ext &src_desc);
-
-  /// Computing a specified dropout function value asynchronously.
-  /// \param [in] desc Dropout descriptor.
-  /// \param [in] src_desc Source memory descriptor.
-  /// \param [in] src Pointer to source data.
-  /// \param [in] dst_desc Destination memory descriptor.
-  /// \param [out] dst Pointer to destination data.
-  /// \param [in] workspace Pointer to workspace data.
-  /// \param [in] workspace_size Size of workspace memory.
-  /// \returns An event representing the dropout forward operations.
-  sycl::event async_dropout_forward(dropout_desc &desc,
-                                    const memory_desc_ext &src_desc, void *src,
-                                    const memory_desc_ext &dst_desc, void *dst,
-                                    void *workspace, size_t workspace_size);
-
-  /// Computing the gradient of a specified dropout function asynchronously.
-  /// \param [in] desc Dropout descriptor.
-  /// \param [in] diff_dst_desc Differential destination memory descriptor.
-  /// \param [in] diff_dst Pointer to differential destination data.
-  /// \param [in] diff_src_desc Differential source memory descriptor.
-  /// \param [out] diff_src Pointer to differential source data.
-  /// \param [in] workspace Pointer to workspace data.
-  /// \param [in] workspace_size Size of workspace memory.
-  /// \returns An event representing the dropout backward operations.
-  sycl::event async_dropout_backward(dropout_desc &desc,
-                                     const memory_desc_ext &diff_dst_desc,
-                                     void *diff_dst,
-                                     const memory_desc_ext &diff_src_desc,
-                                     void *diff_src, void *workspace,
-                                     size_t workspace_size);
-};
-
-inline thread_local unsigned int engine_ext::_engine_count;
-inline thread_local detail::primitive_cache engine_ext::_primitive_cache;
-inline thread_local std::map<void *, ::dnnl::memory> engine_ext::_workspace_map;
-inline thread_local std::map<sycl::queue *,
-                             std::shared_ptr<engine_ext::internal_resource>>
-    engine_ext::_internal_resource_cache;
-
-inline
-void dropout_desc::restore(engine_ext &engine, float p, void *state,
-                                  size_t state_size, unsigned long long seed) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
-                             "Interfaces Project does not support this API.");
-#else
-  if (state) {
-    std::int64_t required_state_size = engine.get_dropout_state_size();
-    if (state_size < required_state_size) {
-      throw std::runtime_error("restore: state_size less than required state size.");
-    }
-    sycl::queue *q = engine.get_queue();
-    _imp->_p = p;
-    _imp->_seed = seed;
-    _imp->_state = state;
-    _imp->_host_state = std::vector<std::uint8_t>(required_state_size);
-    q->memcpy(_imp->_host_state.data(), _imp->_state, required_state_size).wait();
-    _imp->_rng_engine =
-        oneapi::mkl::rng::load_state<rng_engine_t>(
-            *q, _imp->_host_state.data());
-  }
-#endif
-}
-
-inline
-void dropout_desc::set(engine_ext &engine, float p, void *state,
-                              size_t state_size, unsigned long long seed) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
-                             "Interfaces Project does not support this API.");
-#else
-  _imp->_p = p;
-  if (state) {
-    std::int64_t required_state_size = engine.get_dropout_state_size();
-    if (state_size < required_state_size) {
-      throw std::runtime_error("set: no sufficient memory to save states.");
-    }
-    sycl::queue *q = engine.get_queue();
-    _imp->_seed = seed;
-    _imp->_state = state;
-    _imp->_host_state = std::vector<std::uint8_t>(required_state_size);
-    _imp->_rng_engine = rng_engine_t(*q, seed);
-    oneapi::mkl::rng::save_state(_imp->_rng_engine, _imp->_host_state.data());
-    q->memcpy(_imp->_state, _imp->_host_state.data(), required_state_size).wait();
-  }
-#endif
-}
-
-inline
-::dnnl::memory::data_type
-memory_desc_ext::to_dnnl_data_type(dpct::library_data_t dt) {
-  using dnnl_dt = ::dnnl::memory::data_type;
-  switch (dt) {
-  case dpct::library_data_t::real_half:
-    return dnnl_dt::f16;
-  case dpct::library_data_t::real_bfloat16:
-    return dnnl_dt::bf16;
-  case dpct::library_data_t::real_float:
-    return dnnl_dt::f32;
-  case dpct::library_data_t::real_double:
-    return dnnl_dt::f64;
-  case dpct::library_data_t::real_int32:
-    return dnnl_dt::s32;
-  case dpct::library_data_t::real_int8:
-    return dnnl_dt::s8;
-  case dpct::library_data_t::real_uint8:
-    return dnnl_dt::u8;
-  case dpct::library_data_t::real_int8_4:
-    return dnnl_dt::s8;
-  case dpct::library_data_t::real_int8_32:
-    return dnnl_dt::s8;
-  case dpct::library_data_t::real_uint8_4:
-    return dnnl_dt::u8;
-  default:
-    throw std::runtime_error("to_dnnl_data_type: unsupported data type.");
-  }
-}
-
-inline
-dpct::library_data_t
-memory_desc_ext::to_dpct_library_data_t(::dnnl::memory::data_type dt,
-                                        unsigned block_size) {
-  using dpct_dt = dpct::library_data_t;
-  using dnnl_dt = ::dnnl::memory::data_type;
-  switch (dt) {
-  case dnnl_dt::f16:
-    return dpct_dt::real_half;
-  case dnnl_dt::bf16:
-    return dpct_dt::real_bfloat16;
-  case dnnl_dt::f32:
-    return dpct_dt::real_float;
-  case dnnl_dt::f64:
-    return dpct_dt::real_double;
-  case dnnl_dt::s32:
-    return dpct_dt::real_int32;
-  case dnnl_dt::s8:
-    if (block_size == 4) {
-      return dpct_dt::real_int8_4;
-    } else if (block_size == 32) {
-      return dpct_dt::real_int8_32;
-    } else {
-      return dpct_dt::real_int8;
-    }
-  case dnnl_dt::u8:
-    if (block_size == 4) {
-      return dpct_dt::real_uint8_4;
-    } else {
-      return dpct_dt::real_uint8;
-    }
-  default:
-    throw std::runtime_error("to_dpct_library_data_t: unsupported data type "
-                             "dnnl::memory::data_type::undef.");
-  }
-}
-
-inline
-::dnnl::memory::format_tag
-memory_desc_ext::to_dnnl_format_tag(dpct::library_data_t dt,
-                                    memory_format_tag tag) {
-  using dpct_dt = dpct::library_data_t;
-  using dpct_tag = memory_format_tag;
-  using dnnl_tag = ::dnnl::memory::format_tag;
-  switch (tag) {
-  case dpct_tag::nchw:
-    return dnnl_tag::nchw;
-  case dpct_tag::nhwc:
-    return dnnl_tag::nhwc;
-  default:
-    if (dt == dpct_dt::real_int8_32) {
-      return dnnl_tag::nChw32c;
-    } else {
-      return dnnl_tag::nChw4c;
-    }
-  }
-}
-
-inline
-void memory_desc_ext::set(memory_format_tag tag, dpct::library_data_t dt, int n,
-                          int c, int h, int w) {
-  _desc = ::dnnl::memory::desc({n, c, h, w}, to_dnnl_data_type(dt),
-                               to_dnnl_format_tag(dt, tag));
-}
-
-inline
-void memory_desc_ext::set(dpct::library_data_t dt, int n, int c, int h, int w,
-                          int n_stride, int c_stride, int h_stride,
-                          int w_stride) {
-  _desc = ::dnnl::memory::desc({n, c, h, w}, to_dnnl_data_type(dt),
-                               {n_stride, c_stride, h_stride, w_stride});
-}
-
-inline
-void memory_desc_ext::set(dpct::library_data_t dt, int ndims, const int dims[],
-                          const int strides[]) {
-  _desc = ::dnnl::memory::desc({dims, dims + ndims}, to_dnnl_data_type(dt),
-                               {strides, strides + ndims});
-}
-
-inline
-void memory_desc_ext::set(memory_format_tag tag, dpct::library_data_t dt,
-                          int ndims, const int dims[]) {
-  _desc = ::dnnl::memory::desc({dims, dims + ndims}, to_dnnl_data_type(dt),
-                               to_dnnl_format_tag(dt, tag));
-}
-
-inline
-void memory_desc_ext::set(rnn_memory_format_tag tag, dpct::library_data_t dt,
-                          int t, int n, int c) {
-  if (tag == rnn_memory_format_tag::tnc) {
-    _desc = ::dnnl::memory::desc({t, n, c}, to_dnnl_data_type(dt),
-                                 ::dnnl::memory::format_tag::tnc);
-  } else if(tag == rnn_memory_format_tag::ntc) {
-    _desc = ::dnnl::memory::desc({t, n, c}, to_dnnl_data_type(dt),
-                                 ::dnnl::memory::format_tag::ntc);
-  } else {
-    throw std::runtime_error("set: unsupported memory format tag.");
-  }
-}
-
-inline
-void memory_desc_ext::get(dpct::library_data_t *dt, int *n, int *c, int *h,
-                          int *w, int *n_stride, int *c_stride, int *h_stride,
-                          int *w_stride) const {
-  unsigned block_size = 1;
-  auto dims = _desc.get_dims();
-  auto inner_blks = _desc.get_inner_blks();
-  auto strides = _desc.get_strides();
-  if (!inner_blks.empty()) {
-    block_size = inner_blks[0];
-  }
-
-  *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size);
-  *n = dims[0];
-  *c = dims[1];
-  *h = dims[2];
-  *w = dims[3];
-  *n_stride = strides[0] / block_size;
-  *c_stride = strides[1] / block_size;
-  *h_stride = strides[2] / block_size;
-  *w_stride = strides[3] / block_size;
-}
-
-inline
-void memory_desc_ext::get(dpct::library_data_t *dt, memory_format_tag *tag,
-                          int *n, int *c, int *h, int *w) const {
-  unsigned block_size = 1;
-  *tag = memory_format_tag::nchw;
-  auto dims = _desc.get_dims();
-  auto strides = _desc.get_strides();
-  auto inner_blks = _desc.get_inner_blks();
-  if (!inner_blks.empty()) {
-    block_size = inner_blks[0];
-    *tag = memory_format_tag::nchw_blocked;
-  }
-  if (strides[1] == 1 && dims[1] != 1) {
-    *tag = memory_format_tag::nhwc;
-  }
-  *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size);
-  *n = dims[0];
-  *c = dims[1];
-  *h = dims[2];
-  *w = dims[3];
-}
-
-inline
-void memory_desc_ext::get(dpct::library_data_t *dt, rnn_memory_format_tag *tag,
-                          int *t, int *n, int *c) const {
-  auto dims = _desc.get_dims();
-  auto strides = _desc.get_strides();
-
-  if (strides[0] >= strides[1]) {
-    *tag = rnn_memory_format_tag::tnc;
-  } else {
-    *tag = rnn_memory_format_tag::ntc;
-  }
-
-  *dt = to_dpct_library_data_t(_desc.get_data_type(), 1);
-  *t = dims[0];
-  *n = dims[1];
-  *c = dims[2];
-}
-
-inline
-void memory_desc_ext::get(int requested_ndims, dpct::library_data_t *dt,
-                          int *ndims, int dims[], int strides[]) const {
-  unsigned block_size = 1;
-  auto inner_blks = _desc.get_inner_blks();
-  auto adims = _desc.get_dims();
-  auto astrides = _desc.get_strides();
-  if (!inner_blks.empty()) {
-    block_size = inner_blks[0];
-  }
-  *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size);
-  *ndims = _desc.get_ndims();
-  for (int index = 0; index < requested_ndims; index++) {
-    dims[index] = adims[index];
-    strides[index] =
-        astrides[index] / block_size;
-  }
-}
-
-inline
-void memory_desc_ext::get(int requested_ndims, dpct::library_data_t *dt,
-                          memory_format_tag *tag, int *ndims,
-                          int dims[]) const {
-  unsigned block_size = 1;
-  *tag = memory_format_tag::nchw;
-  auto inner_blks = _desc.get_inner_blks();
-  auto adims = _desc.get_dims();
-  auto astrides = _desc.get_strides();
-  if (!inner_blks.empty()) {
-    block_size = inner_blks[0];
-    *tag = memory_format_tag::nchw_blocked;
-  }
-  if (astrides[1] == 1 &&
-      adims[1] != 1) {
-    *tag = memory_format_tag::nhwc;
-  }
-  *dt = to_dpct_library_data_t(_desc.get_data_type(), block_size);
-  *ndims = _desc.get_ndims();
-  for (int index = 0; index < requested_ndims; index++) {
-    dims[index] = adims[index];
-  }
-}
-
-inline
-void engine_ext::get_rnn_configuration(const ::dnnl::memory::desc &desc,
-                                       rnn_direction direction, rnn_mode mode,
-                                       dpct::library_data_t dt, int hidden_size,
-                                       ::dnnl::memory::data_type *dnnl_dt,
-                                       ::dnnl::memory::format_tag *tag,
-                                       int *projection_size, int *output_size,
-                                       int *seq_length, int *batch_size,
-                                       int *direction_num, int *gate_num) {
-  if (!desc.is_zero()) {
-    auto dims = desc.get_dims();
-    auto strides = desc.get_strides();
-    if (strides[0] >= strides[1]) {
-      *tag = ::dnnl::memory::format_tag::tnc;
-      *seq_length = dims[0];
-      *batch_size = dims[1];
-    } else {
-      *tag = ::dnnl::memory::format_tag::ntc;
-      *seq_length = dims[1];
-      *batch_size = dims[0];
-    }
-  }
-  if (direction == rnn_direction::bidirectional) {
-    *direction_num = 2;
-  } else {
-    *direction_num = 1;
-  }
-  if (mode == rnn_mode::lstm) {
-    *gate_num = 4;
-  } else if (mode == rnn_mode::gru) {
-    *gate_num = 3;
-  } else {
-    *gate_num = 1;
-  }
-  if (*projection_size != hidden_size) {
-    *output_size = *projection_size;
-  } else {
-    *projection_size = 0;
-    *output_size = hidden_size;
-  }
-  *dnnl_dt = memory_desc_ext::to_dnnl_data_type(dt);
-}
-
-inline
-void *engine_ext::allocate(const memory_desc_ext &data_desc, int count) {
-  return allocate(data_desc.get_size() * count);
-}
-
-inline
-void *engine_ext::allocate(size_t size) {
-  auto &Info = get_internal_resource(_q)->binfo;
-  uint8_t *result = Info.buffer + Info.usage;
-  Info.usage += size;
-  return result;
-}
-
-inline
-void engine_ext::transform_no_zero(const memory_desc_ext &desc, void *src, void *dst) {
-  ::dnnl::memory::data_type dt = desc.get_desc().get_data_type();
-  size_t element_num = desc.get_element_num();
-  switch (dt) {
-  case ::dnnl::memory::data_type::f32:
-    transform_no_zero_with_type<float>(_q, src, dst, element_num);
-    break;
-  case ::dnnl::memory::data_type::f16:
-    transform_no_zero_with_type<sycl::half>(_q, src, dst, element_num);
-    break;
-  case ::dnnl::memory::data_type::s32:
-    transform_no_zero_with_type<int32_t>(_q, src, dst, element_num);
-    break;
-  case ::dnnl::memory::data_type::s8:
-    transform_no_zero_with_type<int8_t>(_q, src, dst, element_num);
-    break;
-  case ::dnnl::memory::data_type::u8:
-    transform_no_zero_with_type<uint8_t>(_q, src, dst, element_num);
-    break;
-  default:
-    throw std::runtime_error("transform_no_zero: unsupported data type.");
-  }
-}
-
-inline
-::dnnl::memory::desc
-engine_ext::get_group_weight_desc(int group_count,
-                                  const memory_desc_ext &weight_desc) {
-  if (group_count == 1) {
-    return weight_desc.get_desc();
-  }
-  auto help_weight_desc = weight_desc.get_desc();
-  int ndims = help_weight_desc.get_ndims();
-  if (!help_weight_desc.get_inner_blks().empty()) {
-    throw std::runtime_error("get_group_weight_desc: group convolution with "
-                             "blocked weight memory unimplemented.");
-  }
-  std::vector<int64_t> new_size;
-  auto old_size = weight_desc.get_dims();
-  new_size.push_back(group_count);
-  new_size.push_back(old_size[0] / group_count);
-  for (int index = 1; index < old_size.size(); index++) {
-    new_size.push_back(old_size[index]);
-  }
-  std::vector<int64_t> strides = help_weight_desc.get_strides();
-  ::dnnl::memory::format_tag tag;
-  bool is_nhwc = (strides[1] == 1 && old_size[1] != 1);
-
-  if (ndims == 4) {
-    if (is_nhwc) {
-      tag = ::dnnl::memory::format_tag::gohwi;
-    } else {
-      tag = ::dnnl::memory::format_tag::goihw;
-    }
-  } else if (ndims == 5) {
-    if (is_nhwc) {
-      tag = ::dnnl::memory::format_tag::godhwi;
-    } else {
-      tag = ::dnnl::memory::format_tag::goidhw;
-    }
-  }
-
-  help_weight_desc =
-      ::dnnl::memory::desc(new_size, weight_desc.get_desc().get_data_type(), tag);
-  return help_weight_desc;
-}
-
-inline
-::dnnl::memory::desc engine_ext::compress_spatial_dimensions_to_channel(
-    const ::dnnl::memory::desc &desc) {
-  int ndims = desc.get_ndims();
-  auto dims = desc.get_dims();
-  auto inner_blks = desc.get_inner_blks();
-  assert(ndims >= 4 && "ndims is at least 4.");
-  std::vector<int64_t> compressed_dims(ndims);
-  compressed_dims[0] = dims[0];
-  compressed_dims[1] = dims[1];
-  for (int index = 2; index < ndims; index++) {
-    compressed_dims[1] = compressed_dims[1] * dims[index];
-    compressed_dims[index] = 1;
-  }
-  if (!inner_blks.empty() && inner_blks[0] == 4) {
-    return ::dnnl::memory::desc(compressed_dims, desc.get_data_type(),
-                                ::dnnl::memory::format_tag::nChw4c);
-  } else if (!inner_blks.empty() && inner_blks[0] == 32) {
-    return ::dnnl::memory::desc(compressed_dims, desc.get_data_type(),
-                                ::dnnl::memory::format_tag::nChw32c);
-  }
-  std::vector<int64_t> strides(ndims, 1);
-  strides[0] = compressed_dims[1];
-
-  return ::dnnl::memory::desc(compressed_dims, desc.get_data_type(), strides);
-}
-
-inline
-::dnnl::memory::desc
-engine_ext::get_bn_scale_bias_mean_var_desc(const ::dnnl::memory::desc &desc,
-                                            batch_normalization_mode mode) {
-  int ndims = desc.get_ndims();
-  auto dims = desc.get_dims();
-  assert(ndims >= 4 && "ndims is at least 4.");
-  int channel_num = 1;
-  if (mode == batch_normalization_mode::spatial) {
-    channel_num = dims[1];
-  } else {
-    for (int index = 1; index < ndims; index++) {
-      channel_num = channel_num * dims[index];
-    }
-  }
-  return ::dnnl::memory::desc({channel_num}, desc.get_data_type(),
-                              ::dnnl::memory::format_tag::a);
-}
-
-inline
-::dnnl::memory::desc engine_ext::transfer_memory_desc_to_channel_major_format(
-    const ::dnnl::memory::desc &desc) {
-  if (!desc.get_inner_blks().empty()) {
-    return desc;
-  }
-  int ndims = desc.get_ndims();
-  auto dims = desc.get_dims();
-  if (ndims == 4) {
-    return ::dnnl::memory::desc(dims, desc.get_data_type(),
-                                ::dnnl::memory::format_tag::nchw);
-  }
-  return ::dnnl::memory::desc(dims, desc.get_data_type(),
-                              ::dnnl::memory::format_tag::ncdhw);
-}
-
-/// If the alpha = 0 and beta = 1, then the destination (dst = alpha * out +
-/// beta * prior_dst) have no change. In this case this function returns true
-/// means the operation can exit directly.
-inline
-bool engine_ext::scale_parameter_preprocess(
-    const std::vector<output_argument_info> &args) {
-  bool direct_exit = true;
-  for (auto &arg : args) {
-    if (arg._alpha == 0.f) {
-      if (arg._beta != 1.f) {
-        async_scale(arg._beta, arg._desc, arg._data);
-      }
-    } else {
-      direct_exit = false;
-    }
-  }
-  return direct_exit;
-}
-
-inline
-void engine_ext::derive_batch_normalization_memory_desc(
-    memory_desc_ext &scale_bias_desc, memory_desc_ext &mean_var_desc,
-    const memory_desc_ext &src_desc, batch_normalization_mode mode) {
-    derive_batch_normalization_memory_desc(scale_bias_desc, src_desc, mode);
-    derive_batch_normalization_memory_desc(mean_var_desc, src_desc, mode);
-}
-
-inline
-void engine_ext::derive_batch_normalization_memory_desc(
-    memory_desc_ext &desc, const memory_desc_ext &src_desc,
-    batch_normalization_mode mode) {
-  int src_ndims = src_desc.get_desc().get_ndims();
-  auto inner_blks = src_desc.get_desc().get_inner_blks();
-  if (src_desc.get_desc().get_ndims() != 4 ||
-      src_desc.get_desc().get_ndims() != 5) {
-    throw std::runtime_error("derive_batch_normalization_memory_desc: only 4d "
-                             "and 5d memory descriptor supported.");
-  }
-  std::vector<int64_t> dims = src_desc.get_dims();
-  dims[0] = 1;
-  if (mode == batch_normalization_mode::spatial) {
-    dims[2] = 1;
-    dims[3] = 1;
-    if (src_ndims == 5) {
-      dims[4] = 1;
-    }
-  }
-  auto data_type = src_desc.get_desc().get_data_type();
-  if (data_type == ::dnnl::memory::data_type::f16) {
-    data_type = ::dnnl::memory::data_type::f32;
-  }
-  if (!inner_blks.empty() && inner_blks[0] == 4) {
-    desc.set_desc(::dnnl::memory::desc(dims, data_type,
-                                       ::dnnl::memory::format_tag::nChw4c));
-  } else if (!inner_blks.empty() && inner_blks[0] == 32) {
-    desc.set_desc(::dnnl::memory::desc(dims, data_type,
-                                       ::dnnl::memory::format_tag::nChw32c));
-  } else {
-    if (src_ndims == 4) {
-      desc.set_desc(::dnnl::memory::desc(dims, data_type,
-                                         ::dnnl::memory::format_tag::nchw));
-    } else {
-      desc.set_desc(::dnnl::memory::desc(dims, data_type,
-                                         ::dnnl::memory::format_tag::ncdhw));
-    }
-  }
-}
-
-template <typename primitive_type>
-sycl::event engine_ext::execute_primitive(
-    const std::pair<detail::primitive_cache_key_type, detail::primitive_and_args>
-        &primitive,
-    const std::vector<output_argument_info> &output_args) {
-  std::vector<void *> caches;
-  int output_arg_num = output_args.size();
-  for (int i = 0; i < output_arg_num; i++) {
-    if (output_args[i]._beta != 0.f) {
-      auto cache = allocate(output_args[i]._desc);
-      caches.push_back(cache);
-      (*primitive.second.args)[output_args[i]._name].set_data_handle(cache);
-    }
-  }
-
-  auto e = ::dnnl::sycl_interop::execute(
-      *(static_cast<primitive_type *>(primitive.second.primitive)), *_s,
-      *primitive.second.args);
-  _primitive_cache.put(
-      primitive.first, primitive.second.primitive, primitive.second.args,
-      [](::dnnl::primitive *p) { delete static_cast<primitive_type *>(p); }, e,
-      _q);
-  int cache_index = 0;
-  for (int i = 0; i < output_arg_num; i++) {
-    if (output_args[i]._beta != 0.f) {
-      e = async_sum(output_args[i]._alpha, output_args[i]._desc,
-                    caches[cache_index++], output_args[i]._beta,
-                    output_args[i]._desc, output_args[i]._data);
-    } else {
-      if (output_args[i]._alpha != 1.f) {
-        e = async_scale(output_args[i]._alpha, output_args[i]._desc,
-                        output_args[i]._data);
-      }
-    }
-  }
-  return e;
-}
-
-inline
-::dnnl::memory::desc engine_ext::bn_reorder_memory_to_channel_major_format(
-    bool is_input, ::dnnl::memory::desc &desc, void *src, void **cache) {
-  ::dnnl::memory::desc result;
-  result = transfer_memory_desc_to_channel_major_format(desc);
-  if ((result != desc) || !src) {
-    *cache = allocate(desc);
-    if (is_input && src) {
-      async_reorder(1.f, desc, src, 0.f, result, *cache);
-    }
-  }
-  return result;
-}
-
-inline
-sycl::event engine_ext::batch_normalization_backward_internal(
-    batch_normalization_mode mode, float epsilon, float alpha_data,
-    const memory_desc_ext &src_desc, void *src,
-    const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data,
-    const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param,
-    const memory_desc_ext &diff_scale_bias_desc, void *scale, void *bias,
-    float beta_param, void *diff_scale, void *diff_bias,
-    const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var) {
-  if (scale_parameter_preprocess(
-          {{alpha_data, beta_data, diff_src_desc, diff_src},
-           {alpha_param, beta_param, diff_scale_bias_desc, diff_scale},
-           {alpha_param, beta_param, diff_scale_bias_desc, diff_bias}})) {
-    return sycl::event();
-  }
-
-  void *reordered_src = nullptr, *reordered_diff_dst = nullptr,
-       *reordered_diff_src = nullptr, *reordered_scale = nullptr,
-       *reordered_bias = nullptr, *reordered_diff_scale = nullptr,
-       *reordered_diff_bias = nullptr, *reordered_saved_mean = nullptr,
-       *reordered_saved_var = nullptr;
-
-  ::dnnl::memory::desc help_src_desc = src_desc.get_desc();
-  ::dnnl::memory::desc help_diff_dst_desc = diff_dst_desc.get_desc();
-  ::dnnl::memory::desc help_diff_src_desc = diff_src_desc.get_desc();
-  ::dnnl::memory::desc help_diff_scale_bias_desc =
-      diff_scale_bias_desc.get_desc();
-  ::dnnl::memory::desc help_mean_var_desc = mean_var_desc.get_desc();
-  ::dnnl::memory::desc actual_diff_src_desc = help_diff_src_desc;
-  ::dnnl::memory::desc actual_diff_scale_bias_desc = help_diff_scale_bias_desc;
-  enter_primitive(
-      help_diff_scale_bias_desc.get_size() * 14 + help_src_desc.get_size() * 2 +
-      help_diff_dst_desc.get_size() * 7 + help_diff_src_desc.get_size() * 5 +
-      help_mean_var_desc.get_size() * 13);
-  if (mode == batch_normalization_mode::per_activation) {
-    help_src_desc = bn_reorder_memory_to_channel_major_format(true, help_src_desc, src,
-                                                       &reordered_src);
-    help_diff_dst_desc = bn_reorder_memory_to_channel_major_format(
-        true, help_diff_dst_desc, diff_dst, &reordered_diff_dst);
-    help_diff_src_desc = bn_reorder_memory_to_channel_major_format(
-        false, help_diff_src_desc, diff_src, &reordered_diff_src);
-    actual_diff_src_desc = help_diff_src_desc;
-    help_diff_scale_bias_desc = bn_reorder_memory_to_channel_major_format(
-        true, help_diff_scale_bias_desc, scale, &reordered_scale);
-    actual_diff_scale_bias_desc = help_diff_scale_bias_desc;
-    if (bias) {
-      bn_reorder_memory_to_channel_major_format(true, help_diff_scale_bias_desc, bias,
-                                         &reordered_bias);
-    }
-    bn_reorder_memory_to_channel_major_format(false, help_diff_scale_bias_desc,
-                                       diff_scale, &reordered_diff_scale);
-    bn_reorder_memory_to_channel_major_format(false, help_diff_scale_bias_desc,
-                                       diff_bias, &reordered_diff_bias);
-
-    help_mean_var_desc = bn_reorder_memory_to_channel_major_format(
-        true, help_mean_var_desc, saved_mean, &reordered_saved_mean);
-    bn_reorder_memory_to_channel_major_format(true, help_mean_var_desc, saved_var,
-                                       &reordered_saved_var);
-    help_src_desc = compress_spatial_dimensions_to_channel(help_src_desc);
-    help_diff_src_desc =
-        compress_spatial_dimensions_to_channel(help_diff_src_desc);
-    help_diff_dst_desc =
-        compress_spatial_dimensions_to_channel(help_diff_dst_desc);
-  } else {
-    if ((help_src_desc != help_diff_dst_desc) ||
-        (help_src_desc != help_diff_src_desc) ||
-        (help_diff_dst_desc != help_diff_src_desc)) {
-      help_src_desc = bn_reorder_memory_to_channel_major_format(
-          true, help_src_desc, src, &reordered_src);
-      help_diff_dst_desc = bn_reorder_memory_to_channel_major_format(
-          true, help_diff_dst_desc, diff_dst, &reordered_diff_dst);
-      help_diff_src_desc = bn_reorder_memory_to_channel_major_format(
-          false, help_diff_src_desc, diff_src, &reordered_diff_src);
-      actual_diff_src_desc = help_diff_src_desc;
-    }
-  }
-
-  help_diff_scale_bias_desc =
-      get_bn_scale_bias_mean_var_desc(help_diff_scale_bias_desc, mode);
-  help_mean_var_desc =
-      get_bn_scale_bias_mean_var_desc(help_mean_var_desc, mode);
-
-  auto forward_primitive =
-      create_primitive_desc<::dnnl::batch_normalization_forward>(
-          ::dnnl::prop_kind::forward_training, help_src_desc,
-          help_diff_dst_desc, epsilon,
-          ::dnnl::normalization_flags::use_scale |
-              ::dnnl::normalization_flags::use_shift);
-  auto primitive_args =
-      create_primitive_args_or_get<::dnnl::batch_normalization_backward>(
-          ::dnnl::prop_kind::backward, help_diff_src_desc, help_diff_dst_desc,
-          help_src_desc, epsilon,
-          ::dnnl::normalization_flags::use_scale |
-              ::dnnl::normalization_flags::use_shift, forward_primitive);
-
-  void *dst_cache = nullptr;
-  if (!saved_mean && !saved_var) {
-    dst_cache = allocate(diff_dst_desc);
-    if (!reordered_saved_mean) {
-      reordered_saved_mean = allocate(mean_var_desc);
-    }
-    if (!reordered_saved_var) {
-      reordered_saved_var = allocate(mean_var_desc);
-    }
-    if (!bias) {
-      _q->fill(reordered_bias, 0, diff_scale_bias_desc.get_size());
-    }
-
-    batch_normalization_forward_internal(
-        true, mode, epsilon, 0.f, 1.f, src_desc, src, 0.f, diff_dst_desc,
-        dst_cache, diff_scale_bias_desc, scale, bias ? bias : reordered_bias,
-        mean_var_desc, reordered_saved_mean, reordered_saved_var, nullptr,
-        nullptr);
-  }
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, help_src_desc,
-             reordered_src ? reordered_src : src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_SCALE,
-             help_diff_scale_bias_desc,
-             reordered_scale ? reordered_scale : scale);
-  insert_arg(primitive_args.second.args, DNNL_ARG_MEAN, help_mean_var_desc,
-             reordered_saved_mean ? reordered_saved_mean : saved_mean);
-  insert_arg(primitive_args.second.args, DNNL_ARG_VARIANCE, help_mean_var_desc,
-             reordered_saved_var ? reordered_saved_var : saved_var);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, help_diff_src_desc,
-             reordered_diff_dst ? reordered_diff_dst : diff_dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, help_diff_src_desc,
-             reordered_diff_src ? reordered_diff_src : diff_src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SCALE,
-             help_diff_scale_bias_desc,
-             reordered_diff_scale ? reordered_diff_scale : diff_scale);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SHIFT,
-             help_diff_scale_bias_desc,
-             reordered_diff_bias ? reordered_diff_bias : diff_bias);
-
-  sycl::event e = execute_primitive<::dnnl::batch_normalization_backward>(
-      primitive_args,
-      {{alpha_data, beta_data, DNNL_ARG_DIFF_SRC, help_diff_src_desc,
-        reordered_diff_src ? reordered_diff_src : diff_src},
-       {alpha_param, beta_param, DNNL_ARG_DIFF_SCALE, help_diff_scale_bias_desc,
-        reordered_diff_scale ? reordered_diff_scale : diff_scale},
-       {alpha_param, beta_param, DNNL_ARG_DIFF_SHIFT, help_diff_scale_bias_desc,
-        reordered_diff_bias ? reordered_diff_bias : diff_bias}});
-  if (actual_diff_src_desc != diff_src_desc.get_desc() && reordered_diff_src) {
-    e = async_reorder(1.f, actual_diff_src_desc, reordered_diff_src, 0.f,
-                diff_src_desc, diff_src);
-  }
-  if (actual_diff_scale_bias_desc != diff_scale_bias_desc.get_desc() &&
-      reordered_diff_scale && reordered_diff_bias) {
-    async_reorder(1.f, actual_diff_scale_bias_desc, reordered_diff_scale, 0.f,
-            diff_scale_bias_desc, diff_scale);
-    e = async_reorder(1.f, actual_diff_scale_bias_desc, reordered_diff_bias, 0.f,
-                diff_scale_bias_desc, diff_bias);
-  }
-  return exit_primitive(e);
-}
-
-inline
-sycl::event engine_ext::batch_normalization_forward_internal(
-    bool is_infer, batch_normalization_mode mode, float epsilon, float factor,
-    float alpha, const memory_desc_ext &src_desc, void *src, float beta,
-    const memory_desc_ext &dst_desc, void *dst,
-    const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
-    const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var,
-    void *running_mean, void *running_var) {
-  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
-    return sycl::event();
-  }
-  enter_primitive(src_desc.get_size() + 5 * dst_desc.get_size() +
-                  scale_bias_desc.get_size() * 2 +
-                  mean_var_desc.get_size() * 9);
-  void *reordered_src = nullptr, *reordered_dst = nullptr,
-       *reordered_scale = nullptr, *reordered_bias = nullptr,
-       *reordered_saved_mean = nullptr, *reordered_saved_var = nullptr;
-  ::dnnl::memory::desc help_src_desc = src_desc.get_desc();
-  ::dnnl::memory::desc help_dst_desc = dst_desc.get_desc();
-  ::dnnl::memory::desc help_scale_bias_desc = scale_bias_desc.get_desc();
-  ::dnnl::memory::desc help_mean_var_desc = mean_var_desc.get_desc();
-  ::dnnl::memory::desc actual_dst_desc = help_dst_desc;
-  ::dnnl::memory::desc actual_mean_var_desc = help_mean_var_desc;
-
-  if (mode == batch_normalization_mode::per_activation) {
-    help_src_desc = bn_reorder_memory_to_channel_major_format(true, help_src_desc, src,
-                                                       &reordered_src);
-    help_dst_desc = bn_reorder_memory_to_channel_major_format(
-        false, help_dst_desc, dst, &reordered_dst);
-    actual_dst_desc = help_dst_desc;
-    help_scale_bias_desc = bn_reorder_memory_to_channel_major_format(
-        true, help_scale_bias_desc, scale, &reordered_scale);
-    bn_reorder_memory_to_channel_major_format(true, help_scale_bias_desc, bias,
-                                       &reordered_bias);
-    help_mean_var_desc = bn_reorder_memory_to_channel_major_format(
-        is_infer, help_mean_var_desc, saved_mean,
-        &reordered_saved_mean);
-    actual_mean_var_desc = help_mean_var_desc;
-    bn_reorder_memory_to_channel_major_format(is_infer,
-                                       help_mean_var_desc, saved_var,
-                                       &reordered_saved_var);
-    help_src_desc = compress_spatial_dimensions_to_channel(help_src_desc);
-    help_dst_desc = compress_spatial_dimensions_to_channel(help_dst_desc);
-  } else {
-    if (help_src_desc != help_dst_desc) {
-      help_src_desc = bn_reorder_memory_to_channel_major_format(
-          true, help_src_desc, src, &reordered_src);
-      help_dst_desc = bn_reorder_memory_to_channel_major_format(
-          false, help_dst_desc, dst, &reordered_dst);
-      actual_dst_desc = help_dst_desc;
-    }
-  }
-  help_scale_bias_desc =
-      get_bn_scale_bias_mean_var_desc(help_scale_bias_desc, mode);
-  help_mean_var_desc =
-      get_bn_scale_bias_mean_var_desc(help_mean_var_desc, mode);
-
-  ::dnnl::prop_kind kind;
-  ::dnnl::normalization_flags flag = ::dnnl::normalization_flags::use_scale |
-                                     ::dnnl::normalization_flags::use_shift;
-  if (is_infer) {
-    kind = ::dnnl::prop_kind::forward_inference;
-    flag = ::dnnl::normalization_flags::use_global_stats | flag;
-  } else {
-    kind = ::dnnl::prop_kind::forward_training;
-  }
-  auto primitive_args =
-      create_primitive_args_or_get<::dnnl::batch_normalization_forward>(
-          kind, help_src_desc, help_dst_desc, epsilon, flag);
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, help_src_desc,
-             reordered_src ? reordered_src : src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_SCALE, help_scale_bias_desc,
-             reordered_scale ? reordered_scale : scale);
-  insert_arg(primitive_args.second.args, DNNL_ARG_SHIFT, help_scale_bias_desc,
-             reordered_bias ? reordered_bias : bias);
-  insert_arg(primitive_args.second.args, DNNL_ARG_MEAN, help_mean_var_desc,
-             reordered_saved_mean ? reordered_saved_mean
-                                            : saved_mean);
-  insert_arg(primitive_args.second.args, DNNL_ARG_VARIANCE, help_mean_var_desc,
-             reordered_saved_var ? reordered_saved_var
-                                           : saved_var);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, help_dst_desc,
-             reordered_dst ? reordered_dst : dst);
-  sycl::event e = execute_primitive<::dnnl::batch_normalization_forward>(primitive_args,
-                                    {{alpha, beta, DNNL_ARG_DST, help_dst_desc,
-                                      reordered_dst ? reordered_dst : dst}});
-
-  if (!is_infer && running_var) {
-    auto src_ndim = src_desc.get_desc().get_ndims();
-    auto src_dims = src_desc.get_dims();
-    int element_num = src_dims[0];
-    if (mode == batch_normalization_mode::spatial) {
-      for (int index = 2; index < src_ndim; index++) {
-        element_num *= src_dims[index];
-      }
-    }
-    float unbias_factor = element_num / (element_num - 1.f);
-    async_scale(1.f - factor, mean_var_desc, running_var);
-    e = async_sum(factor * unbias_factor, mean_var_desc,
-            reordered_saved_var ? reordered_saved_var : saved_var,
-            1.f, mean_var_desc, running_var);
-  }
-  if (!is_infer && running_mean) {
-    e = async_sum(factor, mean_var_desc,
-            reordered_saved_mean ? reordered_saved_mean : saved_mean,
-            (1.f - factor), mean_var_desc, running_mean);
-  }
-  if (reordered_dst && (actual_dst_desc != dst_desc.get_desc())) {
-    e = async_reorder(1.f, actual_dst_desc, reordered_dst, 0.f, dst_desc, dst);
-  }
-  if (!is_infer && reordered_saved_mean && reordered_saved_var && saved_mean &&
-      saved_var && (actual_mean_var_desc != mean_var_desc.get_desc())) {
-    e = async_reorder(1.f, actual_mean_var_desc, reordered_saved_mean, 0.f,
-                mean_var_desc, saved_mean);
-    e = async_reorder(1.f, actual_mean_var_desc, reordered_saved_var, 0.f,
-                mean_var_desc, saved_var);
-  }
-  return exit_primitive(e);
-}
-
-inline
-sycl::event engine_ext::rnn_forward_internal(
-    const rnn_desc &desc, ::dnnl::prop_kind kind,
-    const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc,
-    void *dst, const memory_desc_ext &iter_desc, void *src_iter, void *dst_iter,
-    const memory_desc_ext &iter_c_desc, void *src_iter_c, void *dst_iter_c,
-    size_t weight_size, void *weight, size_t workspace_size, void *workspace,
-    size_t scratchpad_size, void *scratchpad, bool is_get_execution_args,
-    size_t *weight_size_query, size_t *workspace_size_query,
-    size_t *scratchpad_size_query) {
-  ::dnnl::memory::data_type src_dt;
-  ::dnnl::memory::format_tag src_format_tag;
-  rnn_mode mode;
-  rnn_bias_mode bias_mode;
-  rnn_direction direction;
-  dpct::library_data_t dt;
-  int direction_num = 1, input_size = 0, hidden_size = 0, projection_size = 0,
-      layer_size = 0, gate_num = 1, output_size = 0, data_type_size = 0,
-      seq_length = 1, batch_size = 1;
-  std::vector<void *> data = {src,        dst,        src_iter, dst_iter,
-                              src_iter_c, dst_iter_c, weight,   workspace,
-                              scratchpad};
-  std::vector<int> offset(6, 0);
-  void *input_layer_cache = nullptr, *hidden_layer_cache = nullptr;
-  sycl::event e;
-  enter_primitive(src_desc.get_size() * 2);
-  desc.get(&mode, &bias_mode, &direction, &dt, &input_size, &hidden_size,
-           &projection_size, &layer_size);
-
-  get_rnn_configuration(src_desc.get_desc(), direction, mode, dt, hidden_size,
-                        &src_dt, &src_format_tag, &projection_size,
-                        &output_size, &seq_length, &batch_size, &direction_num,
-                        &gate_num);
-
-  if (direction == rnn_direction::bidirectional) {
-    // Here to combine the oneDNN bidirectional_sum and 
-    // bidirectional_concat config, so call execute_rnn_forward_primitive
-    // twice.
-    if (layer_size > 1) {
-      if (!is_get_execution_args) {
-        input_layer_cache = allocate(src_desc);
-        hidden_layer_cache = allocate(src_desc);
-        _q->memcpy(input_layer_cache, src, src_desc.get_size());
-      }
-      data[0] = input_layer_cache;
-      data[1] = hidden_layer_cache;
-      e = execute_rnn_forward_primitive(
-          mode, kind, ::dnnl::rnn_direction::bidirectional_sum, bias_mode,
-          src_dt, src_format_tag, seq_length, batch_size, output_size,
-          output_size, 1, direction_num, hidden_size, gate_num, projection_size,
-          data, offset, layer_size - 1, weight_size_query, workspace_size_query,
-          scratchpad_size_query);
-      data[0] =
-          ((layer_size - 1) % 2 == 0) ? input_layer_cache : hidden_layer_cache;
-      data[1] = dst;
-    }
-    e = execute_rnn_forward_primitive(
-        mode, kind, ::dnnl::rnn_direction::bidirectional_concat, bias_mode,
-        src_dt, src_format_tag, seq_length, batch_size, output_size,
-        2 * output_size, 1, direction_num, hidden_size, gate_num,
-        projection_size, data, offset, 1, weight_size_query,
-        workspace_size_query, scratchpad_size_query);
-  } else {
-    e = execute_rnn_forward_primitive(
-        mode, kind, ::dnnl::rnn_direction::unidirectional_left2right, bias_mode,
-        src_dt, src_format_tag, seq_length, batch_size, output_size,
-        output_size, layer_size, direction_num, hidden_size, gate_num,
-        projection_size, data, offset, 1, weight_size_query,
-        workspace_size_query, scratchpad_size_query);
-  }
-
-  return exit_primitive(e);
-}
-
-inline
-sycl::event engine_ext::execute_rnn_forward_primitive(
-    rnn_mode mode, ::dnnl::prop_kind kind, ::dnnl::rnn_direction direction,
-    rnn_bias_mode bias_mode, ::dnnl::memory::data_type dt,
-    ::dnnl::memory::format_tag tag, int seq_length, int batch_size, int src_c,
-    int dst_c, int layer_size, int direction_num, int hidden_size, int gate_num,
-    int projection_size, std::vector<void *> &data, std::vector<int> &offset,
-    int iter_num, size_t *weight_size, size_t *workspace_size,
-    size_t *scratchpad_size) {
-
-  sycl::event e;
-  ::dnnl::primitive *p = nullptr;
-  std::unordered_map<int, ::dnnl::memory> *args = nullptr;
-  detail::primitive_cache_key_type key;
-  std::unordered_map<int, ::dnnl::memory> *execution_args;
-  ::dnnl::memory::desc bias_desc(
-      {layer_size, direction_num, gate_num, hidden_size}, dt,
-      ::dnnl::memory::format_tag::ldgo);
-  ::dnnl::memory::desc weight_layer_desc(
-      {layer_size, direction_num,
-       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
-      dt, ::dnnl::memory::format_tag::ldigo);
-  ::dnnl::memory::desc weight_iter_desc(
-      {layer_size, direction_num,
-       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
-      dt, ::dnnl::memory::format_tag::ldigo);
-  ::dnnl::memory::desc projection_desc;
-  if (projection_size) {
-    projection_desc = ::dnnl::memory::desc(
-        {layer_size, direction_num, hidden_size, projection_size}, dt,
-        ::dnnl::memory::format_tag::ldio);
-  }
-
-  if (weight_size) {
-    *weight_size +=
-        (weight_layer_desc.get_size() + weight_iter_desc.get_size() +
-         projection_desc.get_size() + bias_desc.get_size()) *
-        iter_num;
-    return e;
-  }
-
-  ::dnnl::memory::desc src_desc({seq_length, batch_size, src_c}, dt, tag);
-  ::dnnl::memory::desc dst_desc({seq_length, batch_size, dst_c}, dt, tag);
-  ::dnnl::memory::desc iter_desc(
-      {layer_size, direction_num, batch_size,
-       projection_size ? projection_size : hidden_size},
-      dt, ::dnnl::memory::format_tag::ldnc);
-  ::dnnl::memory::desc iter_c_desc(
-      {layer_size, direction_num, batch_size, hidden_size}, dt,
-      ::dnnl::memory::format_tag::ldnc);
-
-  ::dnnl::memory::desc workspace_desc;
-  ::dnnl::memory::desc scratchpad_desc;
-  ::dnnl::primitive_attr attr;
-  attr.set_scratchpad_mode(::dnnl::scratchpad_mode::user);
-
-  if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) {
-    auto primitive = create_primitive_args_or_get<::dnnl::vanilla_rnn_forward>(
-        kind,
-        mode == rnn_mode::vanilla_relu ? ::dnnl::algorithm::eltwise_relu
-                                       : ::dnnl::algorithm::eltwise_tanh,
-        direction, src_desc, iter_desc, weight_layer_desc, weight_iter_desc,
-        bias_desc, dst_desc, iter_desc, attr);
-
-    auto pd = get_primitive_desc<::dnnl::vanilla_rnn_forward>(
-        primitive.second.primitive);
-
-    workspace_desc = pd.workspace_desc();
-    scratchpad_desc = pd.scratchpad_desc();
-    if (workspace_size && scratchpad_size) {
-      *workspace_size += workspace_desc.get_size() * iter_num;
-      *scratchpad_size = scratchpad_desc.get_size() > *scratchpad_size
-                             ? scratchpad_desc.get_size()
-                             : *scratchpad_size;
-    } else {
-      key = primitive.first;
-      p = primitive.second.primitive;
-      args = primitive.second.args;
-    }
-  } else if (mode == rnn_mode::gru) {
-    auto primitive = create_primitive_args_or_get<::dnnl::gru_forward>(
-        kind, direction, src_desc, iter_desc, weight_layer_desc,
-        weight_iter_desc, bias_desc, dst_desc, iter_desc, attr);
-
-    auto pd =
-        get_primitive_desc<::dnnl::gru_forward>(primitive.second.primitive);
-
-    workspace_desc = pd.workspace_desc();
-    scratchpad_desc = pd.scratchpad_desc();
-    if (workspace_size && scratchpad_size) {
-      *workspace_size += workspace_desc.get_size() * iter_num;
-      *scratchpad_size = scratchpad_desc.get_size() > *scratchpad_size
-                             ? scratchpad_desc.get_size()
-                             : *scratchpad_size;
-    } else {
-      key = primitive.first;
-      p = primitive.second.primitive;
-      args = primitive.second.args;
-    }
-  } else if (mode == rnn_mode::lstm) {
-    auto primitive = create_primitive_args_or_get<::dnnl::lstm_forward>(
-        kind, direction, src_desc, iter_desc, iter_c_desc, weight_layer_desc,
-        weight_iter_desc, ::dnnl::memory::desc(), projection_desc, bias_desc,
-        dst_desc, iter_desc, iter_c_desc, attr);
-
-    auto pd =
-        get_primitive_desc<::dnnl::lstm_forward>(primitive.second.primitive);
-
-    workspace_desc = pd.workspace_desc();
-    scratchpad_desc = pd.scratchpad_desc();
-    if (workspace_size && scratchpad_size) {
-      *workspace_size += workspace_desc.get_size() * iter_num;
-      *scratchpad_size = scratchpad_desc.get_size() > *scratchpad_size
-                             ? scratchpad_desc.get_size()
-                             : *scratchpad_size;
-    } else {
-      key = primitive.first;
-      p = primitive.second.primitive;
-      args = primitive.second.args;
-    }
-  }
-
-  for (int i = 0; i < iter_num; i++) {
-    void *in_cache = data[0], *out_cache = data[1], *dst_iter_c_cache = nullptr,
-         *dst_iter_cache = ((uint8_t *)(data[3]) + offset[1]);
-    if (mode == rnn_mode::lstm) {
-      dst_iter_c_cache = (uint8_t *)(data[4]) + offset[2];
-    }
-    if (!workspace_size) {
-      insert_arg(args, DNNL_ARG_SRC_LAYER, src_desc, data[0]);
-      insert_arg(args, DNNL_ARG_DST_LAYER, dst_desc, data[1]);
-      insert_arg(args, DNNL_ARG_SCRATCHPAD, scratchpad_desc, data[8]);
-      auto insert_rnn_arg = [&](int arg_name, ::dnnl::memory::desc &d, void *data,
-                             int &offset) {
-        insert_arg(args, arg_name, d, (uint8_t *)data + offset);
-        offset += d.get_size();
-      };
-      insert_rnn_arg(DNNL_ARG_SRC_ITER, iter_desc, data[2], offset[0]);
-      insert_rnn_arg(DNNL_ARG_DST_ITER, iter_desc, data[3], offset[1]);
-
-      if (mode == rnn_mode::lstm) {
-        insert_rnn_arg(DNNL_ARG_SRC_ITER_C, iter_c_desc, data[4], offset[2]);
-        insert_rnn_arg(DNNL_ARG_DST_ITER_C, iter_c_desc, data[5], offset[3]);
-      }
-      insert_rnn_arg(DNNL_ARG_WEIGHTS_LAYER, weight_layer_desc, data[6],
-                  offset[4]);
-      insert_rnn_arg(DNNL_ARG_WEIGHTS_ITER, weight_iter_desc, data[6], offset[4]);
-      if (projection_size) {
-        insert_rnn_arg(DNNL_ARG_WEIGHTS_PROJECTION, projection_desc, data[6],
-                    offset[4]);
-      }
-      if (bias_mode == rnn_bias_mode::none) {
-        _q->memset((uint8_t *)(data[6]) + offset[4], 0, bias_desc.get_size());
-      }
-      insert_rnn_arg(DNNL_ARG_BIAS, bias_desc, data[6], offset[4]);
-      if (kind == ::dnnl::prop_kind::forward_training) {
-        insert_rnn_arg(DNNL_ARG_WORKSPACE, workspace_desc, data[7], offset[5]);
-      }
-      if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) {
-        execute_primitive<::dnnl::vanilla_rnn_forward>(
-            {key, {static_cast<::dnnl::vanilla_rnn_forward *>(p), args}});
-      } else if (mode == rnn_mode::gru) {
-        execute_primitive<::dnnl::gru_forward>(
-            {key, {static_cast<::dnnl::gru_forward *>(p), args}});
-      } else if (mode == rnn_mode::lstm) {
-        execute_primitive<::dnnl::lstm_forward>(
-            {key, {static_cast<::dnnl::lstm_forward *>(p), args}});
-      }
-      if (i != iter_num - 1) {
-        std::swap(data[0], data[1]);
-      }
-    }
-    if (kind == ::dnnl::prop_kind::forward_training) {
-      if (workspace_size) {
-        *workspace_size +=
-            (src_desc.get_size() + dst_desc.get_size() + iter_desc.get_size());
-        if (mode == rnn_mode::lstm) {
-          *workspace_size += iter_c_desc.get_size();
-        }
-      } else {
-        _q->memcpy((uint8_t *)(data[7]) + offset[5], in_cache,
-                   src_desc.get_size());
-        offset[5] += src_desc.get_size();
-        _q->memcpy((uint8_t *)(data[7]) + offset[5], out_cache,
-                   dst_desc.get_size());
-        offset[5] += dst_desc.get_size();
-        _q->memcpy((uint8_t *)(data[7]) + offset[5], dst_iter_cache,
-                   iter_desc.get_size());
-        offset[5] += iter_desc.get_size();
-        if (mode == rnn_mode::lstm) {
-          _q->memcpy((uint8_t *)(data[7]) + offset[5], dst_iter_c_cache,
-                     iter_c_desc.get_size());
-          offset[5] += iter_c_desc.get_size();
-        }
-      }
-    }
-  }
-  return e;
-}
-
-inline
-sycl::event engine_ext::execute_rnn_backward_primitive(
-    rnn_mode mode, ::dnnl::rnn_direction direction, rnn_bias_mode bias_mode,
-    ::dnnl::memory::data_type dt, ::dnnl::memory::format_tag tag,
-    int seq_length, int batch_size, int src_c, int dst_c, int layer_size,
-    int direction_num, int hidden_size, int gate_num, int projection_size,
-    std::vector<void *> &data, std::vector<int> &offset, int iter_num) {
-
-  sycl::event e;
-  ::dnnl::primitive *p = nullptr;
-  std::unordered_map<int, ::dnnl::memory> *args = nullptr;
-  detail::primitive_cache_key_type key;
-  ::dnnl::prop_kind fkind = ::dnnl::prop_kind::forward_training;
-  ::dnnl::prop_kind bkind = ::dnnl::prop_kind::backward;
-  ::dnnl::memory::desc bias_desc(
-      {layer_size, direction_num, gate_num, hidden_size}, dt,
-      ::dnnl::memory::format_tag::ldgo);
-  ::dnnl::memory::desc weight_layer_desc(
-      {layer_size, direction_num,
-       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
-      dt, ::dnnl::memory::format_tag::ldigo);
-  ::dnnl::memory::desc weight_iter_desc(
-      {layer_size, direction_num,
-       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
-      dt, ::dnnl::memory::format_tag::ldigo);
-  ::dnnl::memory::desc diff_weight_layer_desc(
-      {layer_size, direction_num,
-       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
-      dt, ::dnnl::memory::format_tag::ldgoi);
-  ::dnnl::memory::desc diff_weight_iter_desc(
-      {layer_size, direction_num,
-       projection_size ? projection_size : hidden_size, gate_num, hidden_size},
-      dt, ::dnnl::memory::format_tag::ldgoi);
-  ::dnnl::memory::desc projection_desc, diff_projection_desc;
-  if (projection_size) {
-    projection_desc = ::dnnl::memory::desc(
-        {layer_size, direction_num, hidden_size, projection_size}, dt,
-        ::dnnl::memory::format_tag::ldio);
-    diff_projection_desc = ::dnnl::memory::desc(
-        {layer_size, direction_num, hidden_size, projection_size}, dt,
-        ::dnnl::memory::format_tag::ldoi);
-  }
-
-  ::dnnl::memory::desc src_desc({seq_length, batch_size, src_c}, dt, tag);
-  ::dnnl::memory::desc dst_desc({seq_length, batch_size, dst_c}, dt, tag);
-  ::dnnl::memory::desc iter_desc(
-      {layer_size, direction_num, batch_size,
-       projection_size ? projection_size : hidden_size},
-      dt, ::dnnl::memory::format_tag::ldnc);
-  ::dnnl::memory::desc iter_c_desc(
-      {layer_size, direction_num, batch_size, hidden_size}, dt,
-      ::dnnl::memory::format_tag::ldnc);
-
-  ::dnnl::memory::desc workspace_desc;
-  ::dnnl::memory::desc scratchpad_desc;
-  ::dnnl::primitive_attr attr;
-  attr.set_scratchpad_mode(::dnnl::scratchpad_mode::user);
-
-  if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) {
-    auto fpd = create_primitive_desc<::dnnl::vanilla_rnn_forward>(
-        fkind,
-        mode == rnn_mode::vanilla_relu ? ::dnnl::algorithm::eltwise_relu
-                                       : ::dnnl::algorithm::eltwise_tanh,
-        direction, src_desc, iter_desc, weight_layer_desc, weight_iter_desc,
-        bias_desc, dst_desc, iter_desc, attr);
-    auto primitive = create_primitive_args_or_get<::dnnl::vanilla_rnn_backward>(
-        bkind,
-        mode == rnn_mode::vanilla_relu ? ::dnnl::algorithm::eltwise_relu
-                                       : ::dnnl::algorithm::eltwise_tanh,
-        direction, src_desc, iter_desc, diff_weight_layer_desc,
-        diff_weight_iter_desc, bias_desc, dst_desc, iter_desc, src_desc,
-        iter_desc, weight_layer_desc, weight_iter_desc, bias_desc, dst_desc,
-        iter_desc, fpd, attr);
-    auto pd = get_primitive_desc<::dnnl::vanilla_rnn_backward>(
-        primitive.second.primitive);
-    workspace_desc = pd.workspace_desc();
-    scratchpad_desc = pd.scratchpad_desc();
-    key = primitive.first;
-    p = primitive.second.primitive;
-    args = primitive.second.args;
-  } else if (mode == rnn_mode::gru) {
-    auto fpd = create_primitive_desc<::dnnl::gru_forward>(
-        fkind, direction, src_desc, iter_desc, weight_layer_desc,
-        weight_iter_desc, bias_desc, dst_desc, iter_desc, attr);
-    auto primitive = create_primitive_args_or_get<::dnnl::gru_backward>(
-        bkind, direction, src_desc, iter_desc, diff_weight_layer_desc,
-        diff_weight_iter_desc, bias_desc, dst_desc, iter_desc, src_desc,
-        iter_desc, weight_layer_desc, weight_iter_desc, bias_desc, dst_desc,
-        iter_desc, fpd, attr);
-    auto pd =
-        get_primitive_desc<::dnnl::gru_backward>(primitive.second.primitive);
-    workspace_desc = pd.workspace_desc();
-    scratchpad_desc = pd.scratchpad_desc();
-    key = primitive.first;
-    p = primitive.second.primitive;
-    args = primitive.second.args;
-  } else if (mode == rnn_mode::lstm) {
-    auto fpd = create_primitive_desc<::dnnl::lstm_forward>(
-        fkind, direction, src_desc, iter_desc, iter_c_desc, weight_layer_desc,
-        weight_iter_desc, ::dnnl::memory::desc(), projection_desc, bias_desc,
-        dst_desc, iter_desc, iter_c_desc, attr);
-    auto primitive = create_primitive_args_or_get<::dnnl::lstm_backward>(
-        bkind, direction, src_desc, iter_desc, iter_c_desc,
-        diff_weight_layer_desc, diff_weight_iter_desc, ::dnnl::memory::desc(),
-        diff_projection_desc, bias_desc, dst_desc, iter_desc, iter_c_desc,
-        src_desc, iter_desc, iter_c_desc, weight_layer_desc, weight_iter_desc,
-        ::dnnl::memory::desc(), projection_desc, bias_desc, dst_desc, iter_desc,
-        iter_c_desc, fpd, attr);
-    auto pd =
-        get_primitive_desc<::dnnl::lstm_backward>(primitive.second.primitive);
-    workspace_desc = pd.workspace_desc();
-    scratchpad_desc = pd.scratchpad_desc();
-    key = primitive.first;
-    p = primitive.second.primitive;
-    args = primitive.second.args;
-  }
-
-  for (int i = 0; i < iter_num; i++) {
-    insert_arg(args, DNNL_ARG_DIFF_SRC_LAYER, src_desc, data[8]);
-    insert_arg(args, DNNL_ARG_DIFF_DST_LAYER, dst_desc, data[9]);
-    insert_arg(args, DNNL_ARG_SCRATCHPAD, scratchpad_desc, data[15]);
-    auto insert_rnn_arg = [&](int arg_name, ::dnnl::memory::desc &d, void *data,
-                           int &offset) {
-      offset += d.get_size();
-      insert_arg(args, arg_name, d, (uint8_t *)data - offset);
-    };
-    if (mode == rnn_mode::lstm) {
-      insert_rnn_arg(DNNL_ARG_DST_ITER_C, iter_c_desc, data[7], offset[0]);
-      insert_rnn_arg(DNNL_ARG_SRC_ITER_C, iter_c_desc, data[4], offset[2]);
-    }
-    insert_rnn_arg(DNNL_ARG_DST_ITER, iter_desc, data[7], offset[0]);
-    insert_rnn_arg(DNNL_ARG_DST_LAYER, dst_desc, data[7], offset[0]);
-    insert_rnn_arg(DNNL_ARG_SRC_LAYER, src_desc, data[7], offset[0]);
-    insert_rnn_arg(DNNL_ARG_WORKSPACE, workspace_desc, data[7], offset[0]);
-    insert_rnn_arg(DNNL_ARG_SRC_ITER, iter_desc, data[2], offset[1]);
-    insert_rnn_arg(DNNL_ARG_BIAS, bias_desc, data[6], offset[3]);
-    if (projection_size) {
-      insert_rnn_arg(DNNL_ARG_WEIGHTS_PROJECTION, diff_projection_desc, data[6],
-                  offset[3]);
-    }
-    insert_rnn_arg(DNNL_ARG_WEIGHTS_ITER, diff_weight_iter_desc, data[6],
-                offset[3]);
-    insert_rnn_arg(DNNL_ARG_WEIGHTS_LAYER, diff_weight_layer_desc, data[6],
-                offset[3]);
-    insert_rnn_arg(DNNL_ARG_DIFF_SRC_ITER, iter_desc, data[10], offset[4]);
-    insert_rnn_arg(DNNL_ARG_DIFF_DST_ITER, iter_desc, data[11], offset[5]);
-    if (mode == rnn_mode::lstm) {
-      insert_rnn_arg(DNNL_ARG_DIFF_SRC_ITER_C, iter_c_desc, data[12], offset[6]);
-      insert_rnn_arg(DNNL_ARG_DIFF_DST_ITER_C, iter_c_desc, data[13], offset[7]);
-    }
-    insert_rnn_arg(DNNL_ARG_DIFF_BIAS, bias_desc, data[14], offset[8]);
-    if (bias_mode == rnn_bias_mode::none) {
-      _q->memset((uint8_t *)(data[14]) - offset[8], 0, bias_desc.get_size());
-    }
-    if (projection_size) {
-      insert_rnn_arg(DNNL_ARG_DIFF_WEIGHTS_PROJECTION, projection_desc, data[14],
-                  offset[8]);
-    }
-    insert_rnn_arg(DNNL_ARG_DIFF_WEIGHTS_ITER, weight_iter_desc, data[14],
-                offset[8]);
-    insert_rnn_arg(DNNL_ARG_DIFF_WEIGHTS_LAYER, weight_layer_desc, data[14],
-                offset[8]);
-    if (mode == rnn_mode::vanilla_relu || mode == rnn_mode::vanilla_tanh) {
-      e = execute_primitive<::dnnl::vanilla_rnn_backward>(
-          {key, {static_cast<::dnnl::vanilla_rnn_backward *>(p), args}});
-    } else if (mode == rnn_mode::gru) {
-      e = execute_primitive<::dnnl::gru_backward>(
-          {key, {static_cast<::dnnl::gru_backward *>(p), args}});
-    } else if (mode == rnn_mode::lstm) {
-      e = execute_primitive<::dnnl::lstm_backward>(
-          {key, {static_cast<::dnnl::lstm_backward *>(p), args}});
-    }
-    if (i != iter_num - 1) {
-      std::swap(data[8], data[9]);
-    }
-  }
-  return e;
-}
-
-#define EMPTY_CACHE_KEY(type)                                                  \
-  template <>                                                                  \
-  inline void engine_ext::generate_cache_key<type>(std::string & key_buffer,   \
-                                                   const type &arg) {}
-
-EMPTY_CACHE_KEY(::dnnl::engine)
-EMPTY_CACHE_KEY(::dnnl::convolution_forward::primitive_desc)
-EMPTY_CACHE_KEY(::dnnl::eltwise_forward::primitive_desc)
-EMPTY_CACHE_KEY(::dnnl::softmax_forward::primitive_desc)
-EMPTY_CACHE_KEY(::dnnl::pooling_forward::primitive_desc)
-EMPTY_CACHE_KEY(::dnnl::lrn_forward::primitive_desc)
-EMPTY_CACHE_KEY(::dnnl::batch_normalization_forward::primitive_desc)
-EMPTY_CACHE_KEY(::dnnl::vanilla_rnn_forward::primitive_desc)
-EMPTY_CACHE_KEY(::dnnl::lstm_forward::primitive_desc)
-EMPTY_CACHE_KEY(::dnnl::gru_forward::primitive_desc)
-#undef EMPTY_CACHE_KEY
-
-template <>
-inline void engine_ext::generate_cache_key<std::vector<float>>(
-    std::string &key_buffer, const std::vector<float> &vec) {
-  key_buffer.append((char *)vec.data(), vec.size() * sizeof(float));
-}
-
-template <>
-inline void engine_ext::generate_cache_key<::dnnl::primitive_attr>(
-    std::string &key_buffer, const ::dnnl::primitive_attr &attr) {
-  if (!attr) {
-    return;
-  }
-  auto math_mode = (uint8_t)attr.get_fpmath_mode();
-  key_buffer.append((char *)&math_mode, sizeof(uint8_t));
-}
-
-template <>
-inline void engine_ext::generate_cache_key<::dnnl::memory::dims>(
-    std::string &key_buffer, const ::dnnl::memory::dims &dims) {
-  key_buffer.append((char *)dims.data(), dims.size() * sizeof(int64_t));
-}
-
-template <>
-inline void engine_ext::generate_cache_key<::dnnl::memory::desc>(
-    std::string &key_buffer, const ::dnnl::memory::desc &desc) {
-  uint8_t params[3] = {(uint8_t)desc.get_format_kind(),
-                       (uint8_t)desc.get_ndims(),
-                       (uint8_t)desc.get_data_type()};
-  generate_cache_key(key_buffer, desc.get_inner_blks());
-  generate_cache_key(key_buffer, desc.get_dims());
-  generate_cache_key(key_buffer, desc.get_strides());
-}
-
-template <typename T>
-void engine_ext::generate_cache_key(std::string &key_buffer, const T &arg) {
-  key_buffer.append((char *)&arg, sizeof(T));
-}
-
-template <typename T, typename... args_type>
-void engine_ext::generate_cache_key(std::string &key_buffer, const T &first_arg,
-                                    const args_type &...args) {
-  generate_cache_key(key_buffer, first_arg);
-  generate_cache_key(key_buffer, args...);
-}
-
-template <typename primitive_type, typename... args_type>
-std::pair<detail::primitive_cache_key_type, detail::primitive_and_args>
-engine_ext::create_primitive_args_or_get(args_type &&...args) {
-  std::string buffer;
-  buffer.reserve(512);
-  generate_cache_key(buffer, std::forward<args_type>(args)...);
-  buffer.append(std::to_string(_engine_id));
-  auto value = _primitive_cache.get(buffer);
-  primitive_type *p = nullptr;
-  std::unordered_map<int, ::dnnl::memory> *a = nullptr;
-  if (value) {
-    p = (primitive_type *)value->_primitive;
-    a = value->_args;
-  } else {
-    p = new primitive_type(create_primitive_desc<primitive_type>(
-        std::forward<args_type>(args)...));
-    a = new std::unordered_map<int, ::dnnl::memory>();
-  }
-  return {buffer, {p, a}};
-}
-
-template <typename primitive_type>
-typename primitive_type::primitive_desc
-engine_ext::get_primitive_desc(::dnnl::primitive *p) {
-  return typename primitive_type::primitive_desc(
-      const_cast<dnnl_primitive_desc_t>(p->get_primitive_desc()));
-}
-
-template <typename primitive_type, typename... args_type>
-typename primitive_type::primitive_desc
-engine_ext::create_primitive_desc(args_type &&...args) {
-  return typename primitive_type::primitive_desc(
-      *_eng, std::forward<args_type>(args)...);
-}
-
-inline
-void engine_ext::fill(const memory_desc_ext &src_desc, void *src,
-                      const void *valuePtr) {
-  async_fill(src_desc, src, valuePtr).wait();
-}
-
-inline
-void engine_ext::reorder(float alpha, const memory_desc_ext &src_desc,
-                         void *src, float beta, const memory_desc_ext &dst_desc,
-                         void *dst) {
-  async_reorder(alpha, src_desc, src, beta, dst_desc, dst).wait();
-}
-
-inline
-void engine_ext::scale(float alpha, const memory_desc_ext &src_desc,
-                       void *src) {
-  async_scale(alpha, src_desc, src).wait();
-}
-inline
-void engine_ext::sum(float alpha, const memory_desc_ext &src_desc, void *src,
-                     float beta, const memory_desc_ext &dst_desc, void *dst) {
-  async_sum(alpha, src_desc, src, beta, dst_desc, dst).wait();
-}
-inline
-void engine_ext::activation_forward(activation_desc &desc, float alpha,
-                                    const memory_desc_ext &src_desc, void *src,
-                                    float beta, const memory_desc_ext &dst_desc,
-                                    void *dst) {
-  async_activation_forward(desc, alpha, src_desc, src, beta, dst_desc, dst)
-      .wait();
-}
-inline
-void engine_ext::activation_backward(
-    activation_desc &desc, float alpha, const memory_desc_ext &dst_desc,
-    void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst,
-    const memory_desc_ext &src_desc, void *src, float beta,
-    const memory_desc_ext &diff_src_desc, void *diff_src) {
-  async_activation_backward(desc, alpha, dst_desc, dst, diff_dst_desc, diff_dst,
-                            src_desc, src, beta, diff_src_desc, diff_src)
-      .wait();
-}
-inline
-void engine_ext::pooling_forward(pooling_desc &desc, float alpha,
-                                 const memory_desc_ext &src_desc, void *src,
-                                 float beta, const memory_desc_ext &dst_desc,
-                                 void *dst,
-                                 ::dnnl::memory *workspace) {
-  async_pooling_forward(desc, alpha, src_desc, src, beta, dst_desc, dst,
-                        workspace).wait();
-}
-
-inline
-void engine_ext::pooling_backward(
-    pooling_desc &desc, float alpha, const memory_desc_ext &dst_desc, void *dst,
-    const memory_desc_ext &diff_dst_desc, void *diff_dst,
-    const memory_desc_ext &src_desc, void *src, float beta,
-    const memory_desc_ext &diff_src_desc, void *diff_src,
-    ::dnnl::memory *workspace) {
-  async_pooling_backward(desc, alpha, dst_desc, dst, diff_dst_desc, diff_dst,
-                         src_desc, src, beta, diff_src_desc, diff_src,
-                         workspace)
-      .wait();
-}
-
-inline
-void engine_ext::softmax_forward(softmax_algorithm alg, softmax_mode mode,
-                                 float alpha, const memory_desc_ext &src_desc,
-                                 void *src, float beta,
-                                 const memory_desc_ext &dst_desc, void *dst) {
-  async_softmax_forward(alg, mode, alpha, src_desc, src, beta, dst_desc, dst)
-      .wait();
-}
-
-inline
-void engine_ext::softmax_backward(softmax_algorithm alg, softmax_mode mode,
-                                  float alpha, const memory_desc_ext &dst_desc,
-                                  void *dst,
-                                  const memory_desc_ext &diff_dst_desc,
-                                  void *diff_dst, float beta,
-                                  const memory_desc_ext &diff_src_desc,
-                                  void *diff_src) {
-  async_softmax_backward(alg, mode, alpha, dst_desc, dst, diff_dst_desc,
-                         diff_dst, beta, diff_src_desc, diff_src)
-      .wait();
-}
-
-inline
-void engine_ext::lrn_forward(lrn_desc &desc, float alpha,
-                             const memory_desc_ext &src_desc, void *src,
-                             float beta, const memory_desc_ext &dst_desc,
-                             void *dst, ::dnnl::memory *workspace) {
-  async_lrn_forward(desc, alpha, src_desc, src, beta, dst_desc, dst, workspace)
-      .wait();
-}
-
-inline
-void engine_ext::lrn_backward(lrn_desc &desc, float alpha,
-                              const memory_desc_ext &dst_desc, void *dst,
-                              const memory_desc_ext &diff_dst_desc,
-                              void *diff_dst, const memory_desc_ext &src_desc,
-                              void *src, float beta,
-                              const memory_desc_ext &diff_src_desc,
-                              void *diff_src,
-                              ::dnnl::memory *workspace) {
-  async_lrn_backward(desc, alpha, dst_desc, dst, diff_dst_desc, diff_dst,
-                     src_desc, src, beta, diff_src_desc, diff_src, workspace)
-      .wait();
-}
-
-inline
-sycl::event engine_ext::async_fill(const memory_desc_ext &src_desc, void *src,
-                             const void *valuePtr) {
-  ::dnnl::memory::data_type dt = src_desc.get_desc().get_data_type();
-  unsigned mem_size = src_desc.get_size();
-  switch (dt) {
-  case ::dnnl::memory::data_type::f32:
-    return fill_with_type<float>(_q, src, valuePtr, mem_size);
-  case ::dnnl::memory::data_type::f16:
-    return fill_with_type<sycl::half>(_q, src, valuePtr, mem_size);
-  case ::dnnl::memory::data_type::s32:
-    return fill_with_type<int32_t>(_q, src, valuePtr, mem_size);
-  case ::dnnl::memory::data_type::s8:
-    return fill_with_type<int8_t>(_q, src, valuePtr, mem_size);
-  case ::dnnl::memory::data_type::u8:
-    return fill_with_type<uint8_t>(_q, src, valuePtr, mem_size);
-  default:
-    throw std::runtime_error("async_fill: unsupported data type.");
-  }
-}
-
-inline
-sycl::event engine_ext::async_reorder(float alpha, const memory_desc_ext &src_desc,
-                                void *src, float beta,
-                                const memory_desc_ext &dst_desc, void *dst) {
-  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
-    return sycl::event();
-  }
-  enter_primitive(2 * dst_desc.get_size());
-
-  auto primitive_args = create_primitive_args_or_get<::dnnl::reorder>(
-      src_desc.get_desc(), *_eng, dst_desc.get_desc());
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
-             src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-             dst);
-
-  return exit_primitive(execute_primitive<::dnnl::reorder>(
-      primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}}));
-}
-
-inline
-sycl::event engine_ext::async_scale(float alpha, const memory_desc_ext &src_desc,
-                              void *src) {
-  if (alpha == 1.f) {
-    return sycl::event();
-  }
-  size_t cache_size = src_desc.get_size();
-  enter_primitive(cache_size);
-  void *src_cache = allocate(cache_size);
-  _q->memcpy(src_cache, src, cache_size);
-  auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_forward>(
-      ::dnnl::prop_kind::forward_inference, ::dnnl::algorithm::eltwise_linear,
-      src_desc.get_desc(), src_desc.get_desc(), alpha, 0.f);
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
-             src_cache);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, src_desc.get_desc(),
-             src);
-
-  return exit_primitive(
-      execute_primitive<::dnnl::eltwise_forward>(primitive_args));
-}
-
-inline sycl::event
-engine_ext::async_sum(float alpha, const memory_desc_ext &src_desc, void *src,
-                      float beta, const memory_desc_ext &dst_desc, void *dst) {
-  if (alpha == 0.f && beta == 1.f) {
-    return sycl::event();
-  }
-  size_t cache_size = dst_desc.get_size();
-  enter_primitive(cache_size);
-  void *dst_cache = allocate(dst_desc);
-  _q->memcpy(dst_cache, dst, cache_size);
-
-  auto primitive_args = create_primitive_args_or_get<::dnnl::sum>(
-      std::vector<float>{alpha, beta},
-      std::vector<::dnnl::memory::desc>{src_desc.get_desc(),
-                                        dst_desc.get_desc()});
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-             dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_MULTIPLE_SRC,
-             src_desc.get_desc(), src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_MULTIPLE_SRC + 1,
-             dst_desc.get_desc(), dst_cache);
-
-  return exit_primitive(execute_primitive<::dnnl::sum>(primitive_args));
-}
-
-inline
-sycl::event engine_ext::async_binary(binary_op op, float alpha_0,
-                               const memory_desc_ext &src_desc_0, void *src_0,
-                               float alpha_1, const memory_desc_ext &src_desc_1,
-                               void *src_1, float beta,
-                               const memory_desc_ext &dst_desc, void *dst) {
-  ::dnnl::algorithm onednn_algorithm;
-  switch (op) {
-  case binary_op::max:
-    onednn_algorithm = ::dnnl::algorithm::binary_max;
-    break;
-  case binary_op::min:
-    onednn_algorithm = ::dnnl::algorithm::binary_min;
-    break;
-  case binary_op::add:
-    onednn_algorithm = ::dnnl::algorithm::binary_add;
-    break;
-  case binary_op::sub:
-    onednn_algorithm = ::dnnl::algorithm::binary_sub;
-    break;
-  case binary_op::mul:
-    onednn_algorithm = ::dnnl::algorithm::binary_mul;
-    break;
-  case binary_op::div:
-    onednn_algorithm = ::dnnl::algorithm::binary_div;
-    break;
-  case binary_op::sqrt:
-    onednn_algorithm = ::dnnl::algorithm::eltwise_sqrt;
-    break;
-  case binary_op::neg:
-    onednn_algorithm = ::dnnl::algorithm::eltwise_linear;
-    break;
-  }
-  size_t src0_cache_size = src_desc_0.get_size();
-  size_t src1_cache_size = src_desc_1.get_size();
-  size_t dst_cache_size = dst_desc.get_size();
-  enter_primitive(2 * src0_cache_size + 2 * src1_cache_size +
-                  5 * dst_cache_size);
-  if (onednn_algorithm == ::dnnl::algorithm::eltwise_sqrt ||
-      onednn_algorithm == ::dnnl::algorithm::eltwise_linear) {
-    void *src_cache = nullptr, *dst_cache = nullptr;
-    src_cache = allocate(src0_cache_size);
-    dst_cache = allocate(dst_cache_size);
-    _q->memcpy(src_cache, src_0, src0_cache_size);
-    _q->memcpy(dst_cache, dst, dst_cache_size);
-    async_scale(alpha_0, src_desc_0, src_cache);
-    async_scale(beta, dst_desc, dst_cache);
-
-    // Let the output = 1 - input to simulate the behavior of neg.
-    auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_forward>(
-        ::dnnl::prop_kind::forward_inference, onednn_algorithm,
-        src_desc_0.get_desc(), dst_desc.get_desc(), -1.f, 1.f);
-
-    insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc_0.get_desc(),
-               src_cache);
-    insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-               dst);
-
-    execute_primitive<::dnnl::eltwise_forward>(
-        primitive_args, {{1.f, 0.f, DNNL_ARG_DST, dst_desc, dst}});
-    return exit_primitive(
-        async_sum(1.f, dst_desc, dst_cache, 1.f, dst_desc, dst));
-  }
-
-  void *src_0_cache = nullptr, *src_1_cache = nullptr, *dst_cache = nullptr;
-
-  src_0_cache = allocate(src0_cache_size);
-  src_1_cache = allocate(src1_cache_size);
-  dst_cache = allocate(dst_cache_size);
-
-  _q->memcpy(src_0_cache, src_0, src0_cache_size);
-  _q->memcpy(src_1_cache, src_1, src1_cache_size);
-  _q->memcpy(dst_cache, dst, dst_cache_size);
-
-  async_scale(alpha_0, src_desc_0, src_0_cache);
-  async_scale(alpha_1, src_desc_1, src_1_cache);
-  async_scale(beta, dst_desc, dst_cache);
-
-  auto primitive_args = create_primitive_args_or_get<::dnnl::binary>(
-      onednn_algorithm, src_desc_0.get_desc(), src_desc_1.get_desc(),
-      dst_desc.get_desc());
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_0, src_desc_0.get_desc(),
-             src_0_cache);
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_1, src_desc_1.get_desc(),
-             src_1_cache);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-             dst);
-
-  execute_primitive<::dnnl::binary>(primitive_args,
-                                    {{1.f, 0.f, DNNL_ARG_DST, dst_desc, dst}});
-  return exit_primitive(
-      async_sum(1.f, dst_desc, dst_cache, 1.f, dst_desc, dst));
-}
-
-inline
-sycl::event engine_ext::async_reduction(reduction_op op, float alpha,
-                                  const memory_desc_ext &src_desc, void *src,
-                                  float beta, const memory_desc_ext &dst_desc,
-                                  void *dst) {
-  if (alpha == 0.f && beta == 1.f) {
-    return sycl::event();
-  }
-  size_t src_cache_size = src_desc.get_size();
-  size_t dst_cache_size = dst_desc.get_size();
-  enter_primitive(3 * src_cache_size + 2 * dst_cache_size);
-  float p = 2.f;
-  ::dnnl::algorithm onednn_algorithm;
-  void *cache = nullptr;
-  switch (op) {
-  case reduction_op::amax:
-    cache = allocate(src_cache_size);
-    activation_desc adesc;
-    adesc.set_algorithm(::dnnl::algorithm::eltwise_abs);
-    async_activation_forward(adesc, 1.f, src_desc, src, 0.f, src_desc, cache);
-    onednn_algorithm = ::dnnl::algorithm::reduction_max;
-    src = cache;
-    break;
-  case reduction_op::max:
-    onednn_algorithm = ::dnnl::algorithm::reduction_max;
-    break;
-  case reduction_op::min:
-    onednn_algorithm = ::dnnl::algorithm::reduction_min;
-    break;
-  case reduction_op::sum:
-    onednn_algorithm = ::dnnl::algorithm::reduction_sum;
-    break;
-  case reduction_op::mean:
-    onednn_algorithm = ::dnnl::algorithm::reduction_mean;
-    break;
-  case reduction_op::mul:
-    onednn_algorithm = ::dnnl::algorithm::reduction_mul;
-    break;
-  case reduction_op::mul_no_zeros:
-    cache = allocate(src_cache_size);
-    transform_no_zero(src_desc, src, cache);
-    onednn_algorithm = ::dnnl::algorithm::reduction_mul;
-    src = cache;
-    break;
-  case reduction_op::norm1:
-    p = 1.f;
-    onednn_algorithm = ::dnnl::algorithm::reduction_norm_lp_power_p_sum;
-    break;
-  case reduction_op::norm2:
-    onednn_algorithm = ::dnnl::algorithm::reduction_norm_lp_sum;
-    break;
-  }
-  auto primitive_args = create_primitive_args_or_get<::dnnl::reduction>(
-      onednn_algorithm, src_desc.get_desc(), dst_desc.get_desc(), p, 0.f);
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
-             src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-             dst);
-
-  return exit_primitive(execute_primitive<::dnnl::reduction>(
-      primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}}));
-}
-
-inline
-sycl::event engine_ext::async_activation_forward(activation_desc &desc, float alpha,
-                                           const memory_desc_ext &src_desc,
-                                           void *src, float beta,
-                                           const memory_desc_ext &dst_desc,
-                                           void *dst) {
-  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
-    return sycl::event();
-  }
-  enter_primitive(2 * dst_desc.get_size());
-  auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_forward>(
-      ::dnnl::prop_kind::forward, desc.get_algorithm(), src_desc.get_desc(),
-      dst_desc.get_desc(), desc.get_alpha(), desc.get_beta());
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
-             src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-             dst);
-
-  return exit_primitive(execute_primitive<::dnnl::eltwise_forward>(
-      primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}}));
-}
-
-inline
-sycl::event engine_ext::async_activation_backward(
-    activation_desc &desc, float alpha, const memory_desc_ext &dst_desc,
-    void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst,
-    const memory_desc_ext &src_desc, void *src, float beta,
-    const memory_desc_ext &diff_src_desc, void *diff_src) {
-
-  if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) {
-    return sycl::event();
-  }
-  enter_primitive(2 * diff_src_desc.get_size());
-  ::dnnl::memory::desc data_desc = dst_desc.get_desc();
-  auto alg = desc.get_algorithm();
-  if ((alg == ::dnnl::algorithm::eltwise_clip) ||
-      (alg == ::dnnl::algorithm::eltwise_linear) ||
-      (alg == ::dnnl::algorithm::eltwise_swish)) {
-    data_desc = src_desc.get_desc();
-  }
-  auto primitive_args = create_primitive_args_or_get<::dnnl::eltwise_backward>(
-      alg, diff_src_desc.get_desc(), diff_dst_desc.get_desc(), data_desc,
-      desc.get_alpha(), desc.get_beta(),
-      create_primitive_desc<::dnnl::eltwise_forward>(
-          ::dnnl::prop_kind::forward, alg, src_desc.get_desc(),
-          dst_desc.get_desc(), desc.get_alpha(), desc.get_beta()));
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-             dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
-             src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST,
-             diff_dst_desc.get_desc(), diff_dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC,
-             diff_src_desc.get_desc(), diff_src);
-
-  return exit_primitive(execute_primitive<::dnnl::eltwise_backward>(
-      primitive_args,
-      {{alpha, beta, DNNL_ARG_DIFF_SRC, diff_src_desc, diff_src}}));
-}
-
-inline
-sycl::event engine_ext::async_pooling_forward(pooling_desc &desc, float alpha,
-                                        const memory_desc_ext &src_desc,
-                                        void *src, float beta,
-                                        const memory_desc_ext &dst_desc,
-                                        void *dst, ::dnnl::memory *workspace) {
-  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
-    return sycl::event();
-  }
-  enter_primitive(2 * dst_desc.get_size());
-  int pooling_dim = desc.get_stride().size();
-  std::vector<int64_t> dilation(pooling_dim, 0);
-  auto primitive_args =
-      create_primitive_args_or_get<::dnnl::pooling_forward>(
-          ::dnnl::prop_kind::forward_training, desc.get_algorithm(),
-          src_desc.get_desc(), dst_desc.get_desc(), desc.get_stride(),
-          desc.get_kernel(), dilation, desc.get_padding(), desc.get_padding());
-  auto pd = get_primitive_desc<::dnnl::pooling_forward>(
-      primitive_args.second.primitive);
-  ::dnnl::memory ws_mem(pd.workspace_desc(), *_eng);
-  if (workspace) {
-    *workspace = ws_mem;
-  } else {
-    insert_workspace(src, ws_mem);
-  }
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
-             src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, ws_mem);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-             dst);
-
-  return exit_primitive(execute_primitive<::dnnl::pooling_forward>(
-      primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}}));
-}
-
-inline
-sycl::event engine_ext::async_pooling_backward(
-    pooling_desc &desc, float alpha, const memory_desc_ext &dst_desc, void *dst,
-    const memory_desc_ext &diff_dst_desc, void *diff_dst,
-    const memory_desc_ext &src_desc, void *src, float beta,
-    const memory_desc_ext &diff_src_desc, void *diff_src,
-    ::dnnl::memory *workspace) {
-  if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) {
-    return sycl::event();
-  }
-  enter_primitive(2 * diff_src_desc.get_size());
-  int pooling_dim = desc.get_stride().size();
-  std::vector<int64_t> dilation(pooling_dim, 0);
-  auto primitive_args = create_primitive_args_or_get<::dnnl::pooling_backward>(
-      desc.get_algorithm(), diff_src_desc.get_desc(), diff_dst_desc.get_desc(),
-      desc.get_stride(), desc.get_kernel(), dilation, desc.get_padding(),
-      desc.get_padding(),
-      create_primitive_desc<::dnnl::pooling_forward>(
-          ::dnnl::prop_kind::forward_training, desc.get_algorithm(),
-          src_desc.get_desc(), dst_desc.get_desc(), desc.get_stride(),
-          desc.get_kernel(), dilation, desc.get_padding(), desc.get_padding()));
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-             dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
-             src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST,
-             diff_dst_desc.get_desc(), diff_dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC,
-             diff_src_desc.get_desc(), diff_src);
-
-  if (workspace) {
-    insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, *workspace);
-  } else {
-    insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE,
-               get_workspace(src));
-  }
-
-  return exit_primitive(execute_primitive<::dnnl::pooling_backward>(
-      primitive_args,
-      {{alpha, beta, DNNL_ARG_DIFF_SRC, diff_src_desc, diff_src}}));
-}
-
-inline
-sycl::event engine_ext::async_softmax_forward(softmax_algorithm alg,
-                                        softmax_mode mode, float alpha,
-                                        const memory_desc_ext &src_desc,
-                                        void *src, float beta,
-                                        const memory_desc_ext &dst_desc,
-                                        void *dst) {
-  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
-    return sycl::event();
-  }
-
-  ::dnnl::memory::desc help_src_desc = src_desc.get_desc();
-  ::dnnl::memory::desc help_dst_desc = dst_desc.get_desc();
-  if (mode == softmax_mode::instance) {
-    help_src_desc = compress_spatial_dimensions_to_channel(help_src_desc);
-    help_dst_desc = compress_spatial_dimensions_to_channel(help_dst_desc);
-  }
-  enter_primitive(2 * help_dst_desc.get_size());
-
-  ::dnnl::algorithm softmax_alg = ::dnnl::algorithm::softmax_accurate;
-  if (alg == softmax_algorithm::log) {
-    softmax_alg = ::dnnl::algorithm::softmax_log;
-  }
-  auto primitive_args = create_primitive_args_or_get<::dnnl::softmax_forward>(
-      ::dnnl::prop_kind::forward, softmax_alg, help_src_desc, 
-      help_dst_desc, 1);
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, help_dst_desc, dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, help_src_desc, src);
-
-  return exit_primitive(execute_primitive<::dnnl::softmax_forward>(
-      primitive_args,
-      {{alpha, beta, DNNL_ARG_DST, memory_desc_ext(help_dst_desc), dst}}));
-}
-
-inline
-sycl::event engine_ext::async_softmax_backward(
-    softmax_algorithm alg, softmax_mode mode, float alpha,
-    const memory_desc_ext &dst_desc, void *dst,
-    const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta,
-    const memory_desc_ext &diff_src_desc, void *diff_src) {
-  if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) {
-    return sycl::event();
-  }
-  ::dnnl::memory::desc help_diff_src_desc = diff_src_desc.get_desc();
-  ::dnnl::memory::desc help_dst_desc = dst_desc.get_desc();
-  ::dnnl::memory::desc help_diff_dst_desc = diff_dst_desc.get_desc();
-  if (mode == softmax_mode::instance) {
-    help_diff_src_desc =
-        compress_spatial_dimensions_to_channel(help_diff_src_desc);
-    help_dst_desc = compress_spatial_dimensions_to_channel(help_dst_desc);
-    help_diff_dst_desc =
-        compress_spatial_dimensions_to_channel(help_diff_dst_desc);
-  }
-  enter_primitive(2 * help_diff_src_desc.get_size());
-
-  ::dnnl::algorithm softmax_alg = ::dnnl::algorithm::softmax_accurate;
-  if (alg == softmax_algorithm::log) {
-    softmax_alg = ::dnnl::algorithm::softmax_log;
-  }
-
-  auto primitive_args = create_primitive_args_or_get<::dnnl::softmax_backward>(
-      softmax_alg, help_diff_src_desc, help_diff_dst_desc, help_dst_desc, 1,
-      create_primitive_desc<::dnnl::softmax_forward>(
-          ::dnnl::prop_kind::forward, softmax_alg, help_diff_src_desc,
-          help_dst_desc, 1));
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, help_dst_desc, dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, help_diff_dst_desc,
-             diff_dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, help_diff_src_desc,
-             diff_src);
-
-  return exit_primitive(execute_primitive<::dnnl::softmax_backward>(
-      primitive_args, {{alpha, beta, DNNL_ARG_DIFF_SRC,
-                        memory_desc_ext(help_diff_src_desc), diff_src}}));
-}
-
-inline
-sycl::event engine_ext::async_lrn_forward(lrn_desc &desc, float alpha,
-                                    const memory_desc_ext &src_desc, void *src,
-                                    float beta, const memory_desc_ext &dst_desc,
-                                    void *dst, ::dnnl::memory *workspace) {
-
-  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
-    return sycl::event();
-  }
-  enter_primitive(2 * dst_desc.get_size());
-  auto primitive_args = create_primitive_args_or_get<::dnnl::lrn_forward>(
-      ::dnnl::prop_kind::forward_training,
-      ::dnnl::algorithm::lrn_across_channels, src_desc.get_desc(),
-      dst_desc.get_desc(), desc.get_local_size(), desc.get_alpha(),
-      desc.get_beta(), desc.get_k());
-  auto pd =
-      get_primitive_desc<::dnnl::lrn_forward>(primitive_args.second.primitive);
-  ::dnnl::memory ws_mem(pd.workspace_desc(), *_eng);
-  if (workspace) {
-    *workspace = ws_mem;
-  } else {
-    insert_workspace(src, ws_mem);
-  }
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
-             src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-             dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, ws_mem);
-
-  return exit_primitive(execute_primitive<::dnnl::lrn_forward>(
-      primitive_args, {{alpha, beta, DNNL_ARG_DST, dst_desc, dst}}));
-}
-
-inline
-sycl::event
-engine_ext::async_lrn_backward(lrn_desc &desc, float alpha,
-                         const memory_desc_ext &dst_desc, void *dst,
-                         const memory_desc_ext &diff_dst_desc, void *diff_dst,
-                         const memory_desc_ext &src_desc, void *src, float beta,
-                         const memory_desc_ext &diff_src_desc, void *diff_src,
-                         ::dnnl::memory *workspace) {
-
-  if (scale_parameter_preprocess({{alpha, beta, diff_src_desc, diff_src}})) {
-    return sycl::event();
-  }
-  enter_primitive(2 * diff_src_desc.get_size());
-  auto primitive_args = create_primitive_args_or_get<::dnnl::lrn_backward>(
-      ::dnnl::algorithm::lrn_across_channels, diff_src_desc.get_desc(),
-      diff_dst_desc.get_desc(), src_desc.get_desc(), desc.get_local_size(),
-      desc.get_alpha(), desc.get_beta(), desc.get_k(),
-      create_primitive_desc<::dnnl::lrn_forward>(
-          ::dnnl::prop_kind::forward_training,
-          ::dnnl::algorithm::lrn_across_channels, src_desc.get_desc(),
-          dst_desc.get_desc(), desc.get_local_size(), desc.get_alpha(),
-          desc.get_beta(), desc.get_k()));
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, src_desc.get_desc(),
-             src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-             dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST,
-             diff_dst_desc.get_desc(), diff_dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC,
-             diff_src_desc.get_desc(), diff_src);
-
-  if (workspace) {
-    insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE, *workspace);
-  } else {
-    insert_arg(primitive_args.second.args, DNNL_ARG_WORKSPACE,
-               get_workspace(src));
-  }
-
-  return exit_primitive(execute_primitive<::dnnl::lrn_backward>(
-      primitive_args,
-      {{alpha, beta, DNNL_ARG_DIFF_SRC, diff_src_desc, diff_src}}));
-}
-
-inline
-size_t engine_ext::get_batch_normalization_workspace_size(
-    batch_normalization_ops ops, const memory_desc_ext &src_desc) {
-  if(ops == batch_normalization_ops::none) {
-    return 0;
-  }
-  return src_desc.get_size();
-}
-
-inline
-sycl::event engine_ext::async_batch_normalization_forward_inference(
-    batch_normalization_mode mode, float epsilon, float alpha,
-    const memory_desc_ext &src_desc, void *src, float beta,
-    const memory_desc_ext &dst_desc, void *dst,
-    const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
-    void *mean, void *var) {
-
-  return batch_normalization_forward_internal(
-      true, mode, epsilon, 0.f, alpha, src_desc, src, beta, dst_desc, dst,
-      scale_bias_mean_var_desc, scale, bias, scale_bias_mean_var_desc, mean,
-      var, nullptr, nullptr);
-}
-
-inline
-sycl::event engine_ext::async_batch_normalization_forward_inference(
-    batch_normalization_mode mode, batch_normalization_ops ops,
-    activation_desc &adesc, float epsilon, float alpha,
-    const memory_desc_ext &src_desc, void *src, float beta,
-    const memory_desc_ext &dst_desc, void *dst,
-    const memory_desc_ext &summand_desc, void *summand,
-    const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
-    const memory_desc_ext &mean_var_desc, void *mean, void *var) {
-
-  bool has_post_op = (ops != batch_normalization_ops::none);
-  sycl::event e;
-  enter_primitive(src_desc.get_size() + dst_desc.get_size() * 4 +
-                  scale_bias_desc.get_size() * 2 +
-                  mean_var_desc.get_size() * 5);
-  if (has_post_op) {
-    void *dst_cache = allocate(dst_desc);
-    batch_normalization_forward_internal(
-        true, mode, epsilon, 0.f, 1.f, src_desc, src, 0.f, dst_desc, dst_cache,
-        scale_bias_desc, scale, bias, mean_var_desc, mean, var, nullptr,
-        nullptr);
-
-    if (ops == batch_normalization_ops::add_activation) {
-      async_sum(1.f, summand_desc, summand, 1.f, dst_desc, dst_cache);
-    }
-    async_activation_forward(adesc, 1.f, dst_desc, dst_cache, 0.f, dst_desc,
-                       dst_cache);
-    return exit_primitive(
-        async_sum(alpha, dst_desc, dst_cache, beta, dst_desc, dst));
-  }
-  return exit_primitive(batch_normalization_forward_internal(
-      true, mode, epsilon, 0.f, alpha, src_desc, src, beta, dst_desc, dst,
-      scale_bias_desc, scale, bias, mean_var_desc, mean, var, nullptr,
-      nullptr));
-}
-
-inline
-sycl::event engine_ext::async_batch_normalization_forward_training(
-    batch_normalization_mode mode, float epsilon, float factor, float alpha,
-    const memory_desc_ext &src_desc, void *src, float beta,
-    const memory_desc_ext &dst_desc, void *dst,
-    const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
-    void *running_mean, void *running_var, void *saved_mean, void *saved_var) {
-  return batch_normalization_forward_internal(
-      false, mode, epsilon, factor, alpha, src_desc, src, beta, dst_desc, dst,
-      scale_bias_mean_var_desc, scale, bias, scale_bias_mean_var_desc,
-      saved_mean, saved_var, running_mean, running_var);
-}
-
-inline
-sycl::event engine_ext::async_batch_normalization_forward_training(
-    batch_normalization_mode mode, batch_normalization_ops ops,
-    activation_desc &adesc, float epsilon, float factor, float alpha,
-    const memory_desc_ext &src_desc, void *src, float beta,
-    const memory_desc_ext &dst_desc, void *dst,
-    const memory_desc_ext &summand_desc, void *summand,
-    const memory_desc_ext &scale_bias_desc, void *scale, void *bias,
-    const memory_desc_ext &mean_var_desc, void *running_mean, void *running_var,
-    void *saved_mean, void *saved_var, size_t workspace_size,
-    void *workspace) {
-  enter_primitive(src_desc.get_size() + dst_desc.get_size() * 3 +
-                  mean_var_desc.get_size() * 5 +
-                  scale_bias_desc.get_size() * 2);
-  bool has_post_op = (ops != batch_normalization_ops::none);
-  sycl::event e;
-  if (has_post_op) {
-    if(workspace_size < dst_desc.get_desc().get_size()) {
-      throw std::runtime_error("async_batch_normalization_forward_training_ex: "
-        "no sufficient workspace.");
-    }
-    batch_normalization_forward_internal(
-        false, mode, epsilon, factor, 1.f, src_desc, src, 0.f, dst_desc,
-        workspace, scale_bias_desc, scale, bias, mean_var_desc,
-        saved_mean, saved_var, running_mean, running_var);
-    if (ops == batch_normalization_ops::add_activation) {
-      async_sum(1.f, summand_desc, summand, 1.f, dst_desc,
-          workspace);
-    }
-    return exit_primitive(async_activation_forward(
-        adesc, alpha, dst_desc, workspace, beta, dst_desc, dst));
-  }
-  return exit_primitive(batch_normalization_forward_internal(
-      false, mode, epsilon, factor, alpha, src_desc, src, beta, dst_desc, dst,
-      scale_bias_desc, scale, bias, mean_var_desc, saved_mean, saved_var,
-      running_mean, running_var));
-}
-
-inline
-sycl::event engine_ext::async_batch_normalization_forward_training(
-    batch_normalization_mode mode, batch_normalization_ops ops,
-    activation_desc &adesc, float epsilon, float factor, float alpha,
-    const memory_desc_ext &src_desc, void *src, float beta,
-    const memory_desc_ext &dst_desc, void *dst,
-    const memory_desc_ext &summand_desc, void *summand,
-    const memory_desc_ext &scale_bias_mean_var_desc, void *scale, void *bias,
-    void *running_mean, void *running_var, void *saved_mean, void *saved_var,
-    size_t workspace_size, void *workspace) {
-  return async_batch_normalization_forward_training(
-      mode, ops, adesc, epsilon, factor, alpha, src_desc, src, beta, dst_desc,
-      dst, summand_desc, summand, scale_bias_mean_var_desc, scale, bias,
-      scale_bias_mean_var_desc, running_mean, running_var, saved_mean,
-      saved_var, workspace_size, workspace);
-}
-
-inline
-sycl::event engine_ext::async_batch_normalization_backward(
-    batch_normalization_mode mode, float epsilon, float alpha_data,
-    const memory_desc_ext &src_desc, void *src,
-    const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta_data,
-    const memory_desc_ext &diff_src_desc, void *diff_src, float alpha_param,
-    const memory_desc_ext &diff_scale_bias_mean_var_desc, void *scale,
-    float beta_param, void *diff_scale, void *diff_bias, void *saved_mean,
-    void *saved_var) {
-
-  return batch_normalization_backward_internal(
-      mode, epsilon, alpha_data, src_desc, src, diff_dst_desc, diff_dst,
-      beta_data, diff_src_desc, diff_src, alpha_param,
-      diff_scale_bias_mean_var_desc, scale, nullptr, beta_param, diff_scale,
-      diff_bias, diff_scale_bias_mean_var_desc, saved_mean, saved_var);
-}
-
-inline
-sycl::event engine_ext::async_batch_normalization_backward(
-    batch_normalization_mode mode, batch_normalization_ops ops,
-    activation_desc &adesc, float epsilon, float alpha_data,
-    const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc,
-    void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst,
-    float beta_data, const memory_desc_ext &diff_src_desc, void *diff_src,
-    const memory_desc_ext &diff_summand_desc, void *diff_summand,
-    float alpha_param, const memory_desc_ext &diff_scale_bias_desc, void *scale,
-    void *bias, float beta_param, void *diff_scale, void *diff_bias,
-    const memory_desc_ext &mean_var_desc, void *saved_mean, void *saved_var,
-    size_t workspace_size, void *workspace) {
-  std::vector<void *> caches;
-  ::dnnl::memory::desc real_diff_dst_desc = diff_dst_desc.get_desc();
-  void *real_diff_dst = diff_dst;
-
-  if (ops != batch_normalization_ops::none &&
-      workspace_size < dst_desc.get_desc().get_size()) {
-    throw std::runtime_error("async_batch_normalization_backward_ex: "
-                             "no sufficient workspace.");
-  }
-  enter_primitive(diff_scale_bias_desc.get_size() * 8 +
-                  src_desc.get_size() * 3 + diff_dst_desc.get_size() * 5 +
-                  diff_src_desc.get_size() + mean_var_desc.get_size() * 9 +
-                  diff_summand_desc.get_size());
-  if (ops == batch_normalization_ops::add_activation) {
-    void *diff_summand_cache = allocate(diff_summand_desc);
-    async_activation_backward(adesc, 1.f, dst_desc, dst, diff_dst_desc, diff_dst,
-                        dst_desc, workspace, 0.f,
-                        diff_summand_desc, diff_summand_cache);
-    async_sum(alpha_data, diff_summand_desc, diff_summand_cache, beta_data,
-        diff_summand_desc, diff_summand);
-    real_diff_dst_desc = diff_summand_desc.get_desc();
-    real_diff_dst = diff_summand_cache;
-  } else if (ops == batch_normalization_ops::activation) {
-    void *diff_dst_cache = allocate(diff_dst_desc);
-    async_activation_backward(adesc, 1.f, dst_desc, dst, diff_dst_desc,
-                        diff_dst, dst_desc, workspace,
-                        0.f, diff_dst_desc, diff_dst_cache);
-    real_diff_dst = diff_dst_cache;
-  }
-
-  return exit_primitive(batch_normalization_backward_internal(
-      mode, epsilon, alpha_data, src_desc, src, real_diff_dst_desc,
-      real_diff_dst, beta_data, diff_src_desc, diff_src, alpha_param,
-      diff_scale_bias_desc, scale, bias, beta_param, diff_scale, diff_bias,
-      mean_var_desc, saved_mean, saved_var));
-}
-
-inline
-sycl::event engine_ext::async_batch_normalization_backward(
-    batch_normalization_mode mode, batch_normalization_ops ops,
-    activation_desc &adesc, float epsilon, float alpha_data,
-    const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc,
-    void *dst, const memory_desc_ext &diff_dst_desc, void *diff_dst,
-    float beta_data, const memory_desc_ext &diff_src_desc, void *diff_src,
-    const memory_desc_ext &diff_summand_desc, void *diff_summand,
-    float alpha_param, const memory_desc_ext &diff_scale_bias_mean_var_desc,
-    void *scale, void *bias, float beta_param, void *diff_scale,
-    void *diff_bias, void *saved_mean, void *saved_var,
-    size_t workspace_size, void *workspace) {
-
-  return async_batch_normalization_backward(
-      mode, ops, adesc, epsilon, alpha_data, src_desc, src, dst_desc, dst,
-      diff_dst_desc, diff_dst, beta_data, diff_src_desc, diff_src,
-      diff_summand_desc, diff_summand, alpha_param,
-      diff_scale_bias_mean_var_desc, scale, bias, beta_param, diff_scale,
-      diff_bias, diff_scale_bias_mean_var_desc, saved_mean, saved_var,
-      workspace_size, workspace);
-}
-
-inline
-sycl::event
-engine_ext::async_convolution_forward(convolution_desc &desc, ::dnnl::algorithm alg,
-                                float alpha, const memory_desc_ext &src_desc,
-                                void *src, const memory_desc_ext &weight_desc,
-                                void *weight, float beta,
-                                const memory_desc_ext &dst_desc, void *dst) {
-  if (scale_parameter_preprocess({{alpha, beta, dst_desc, dst}})) {
-    return sycl::event();
-  }
-  auto help_weight_desc =
-      get_group_weight_desc(desc.get_group_count(), weight_desc);
-
-  ::dnnl::primitive_attr attr;
-  attr.set_fpmath_mode(desc.get_math_mode());
-
-  auto origin_src_md = src_desc.get_desc();
-  auto origin_dst_md = dst_desc.get_desc();
-  auto origin_weight_md = help_weight_desc;
-  auto src_md = transfer_memory_desc_to_format_tag_any(origin_src_md);
-  auto dst_md = transfer_memory_desc_to_format_tag_any(origin_dst_md);
-  auto weight_md = transfer_memory_desc_to_format_tag_any(origin_weight_md);
-
-  auto primitive_args =
-      create_primitive_args_or_get<::dnnl::convolution_forward>(
-          ::dnnl::prop_kind::forward_training, alg, src_md, weight_md, dst_md,
-          desc.get_stride(), desc.get_dilate(), desc.get_padding(),
-          desc.get_padding(), attr);
-
-  auto pd = get_primitive_desc<::dnnl::convolution_forward>(
-      primitive_args.second.primitive);
-  auto optimal_src_md = pd.src_desc();
-  auto optimal_dst_md = pd.dst_desc();
-  auto optimal_weight_md = pd.weights_desc();
-
-  enter_primitive(
-      optimal_src_md.get_size() * 3 + optimal_dst_md.get_size() * 5 +
-      optimal_weight_md.get_size() * 3 + origin_dst_md.get_size() * 2);
-
-  void *optimal_src = src, *optimal_dst = dst, *optimal_weight = weight;
-  allocate_and_reorder_memory_to_optimal(origin_src_md, src, optimal_src_md,
-                                         optimal_src);
-  allocate_and_reorder_memory_to_optimal(origin_weight_md, weight,
-                                         optimal_weight_md, optimal_weight);
-
-  if (beta == 0.f) {
-    if(origin_dst_md != optimal_dst_md) {
-      optimal_dst = allocate(optimal_dst_md);
-    }
-  } else {
-    allocate_and_reorder_memory_to_optimal(origin_dst_md, dst, optimal_dst_md,
-                                           optimal_dst);
-  }
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, optimal_src_md,
-             optimal_src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md,
-             optimal_weight);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, optimal_dst_md,
-             optimal_dst);
-
-  auto e = execute_primitive<::dnnl::convolution_forward>(
-      primitive_args,
-      {{alpha, beta, DNNL_ARG_DST, optimal_dst_md, optimal_dst}});
-
-  if (origin_dst_md != optimal_dst_md) {
-    e = async_reorder(1.f, optimal_dst_md, optimal_dst, 0.f, origin_dst_md,
-                      dst);
-  }
-  return exit_primitive(e);
-}
-
-inline
-sycl::event engine_ext::async_convolution_forward(
-    convolution_desc &desc, ::dnnl::algorithm alg, activation_desc &adesc,
-    float alpha_0, const memory_desc_ext &src_desc, void *src,
-    const memory_desc_ext &weight_desc, void *weight, float alpha_1,
-    const memory_desc_ext &summand_desc, void *summand,
-    const memory_desc_ext &bias_desc, void *bias,
-    const memory_desc_ext &dst_desc, void *dst) {
-
-  int channel_num = bias_desc.get_element_num();
-  auto help_weight_desc =
-      get_group_weight_desc(desc.get_group_count(), weight_desc);
-  ::dnnl::memory::desc help_bias_desc = {{channel_num},
-                                         bias_desc.get_desc().get_data_type(),
-                                         ::dnnl::memory::format_tag::a};
-  auto origin_weight_md = help_weight_desc;
-  auto origin_bias_md = help_bias_desc;
-  auto origin_src_md = src_desc.get_desc();
-  auto origin_dst_md = dst_desc.get_desc();
-  auto src_md = transfer_memory_desc_to_format_tag_any(origin_src_md);
-  auto dst_md = transfer_memory_desc_to_format_tag_any(origin_dst_md);
-  auto weight_md = transfer_memory_desc_to_format_tag_any(origin_weight_md);
-  auto bias_md = transfer_memory_desc_to_format_tag_any(origin_bias_md);
-
-  ::dnnl::primitive_attr attr;
-  attr.set_fpmath_mode(desc.get_math_mode());
-
-  auto primitive_args =
-      create_primitive_args_or_get<::dnnl::convolution_forward>(
-          ::dnnl::prop_kind::forward_training, alg, src_md, weight_md, bias_md,
-          dst_md, desc.get_stride(), desc.get_dilate(), desc.get_padding(),
-          desc.get_padding(), attr);
-
-  auto pd = get_primitive_desc<::dnnl::convolution_forward>(
-      primitive_args.second.primitive);
-  auto optimal_src_md = pd.src_desc();
-  auto optimal_dst_md = pd.dst_desc();
-  auto optimal_weight_md = pd.weights_desc();
-  auto optimal_bias_md = pd.bias_desc();
-
-  enter_primitive(optimal_src_md.get_size() + 3 * optimal_weight_md.get_size() +
-                  optimal_bias_md.get_size() + 7 * optimal_dst_md.get_size() +
-                  summand_desc.get_size());
-
-  void *optimal_src = src, *optimal_dst = dst, *optimal_weight = weight,
-       *optimal_bias = bias;
-  allocate_and_reorder_memory_to_optimal(origin_src_md, src, optimal_src_md,
-                                         optimal_src);
-  allocate_and_reorder_memory_to_optimal(origin_weight_md, weight,
-                                         optimal_weight_md, optimal_weight);
-  allocate_and_reorder_memory_to_optimal(origin_bias_md, bias, optimal_bias_md,
-                                         optimal_bias);
-  if (origin_dst_md != optimal_dst_md) {
-    optimal_dst = allocate(optimal_dst_md);
-  }
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, optimal_src_md,
-             optimal_src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_BIAS, optimal_bias_md,
-             optimal_bias);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, optimal_dst_md,
-             optimal_dst);
-
-  void *cache = nullptr;
-  if (alpha_0 != 1.f) {
-    cache = allocate(optimal_weight_md);
-    _q->memcpy(cache, optimal_weight, optimal_weight_md.get_size());
-    async_scale(alpha_0, optimal_weight_md, cache);
-    insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md,
-               cache);
-    execute_primitive<::dnnl::convolution_forward>(
-        primitive_args,
-        {{1.f, 0.f, DNNL_ARG_DST, optimal_dst_md, optimal_dst}});
-  } else {
-    insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md,
-               optimal_weight);
-    execute_primitive<::dnnl::convolution_forward>(
-        primitive_args,
-        {{1.f, 0.f, DNNL_ARG_DST, optimal_dst_md, optimal_dst}});
-  }
-  if (origin_dst_md != optimal_dst_md) {
-    async_reorder(1.f, optimal_dst_md, optimal_dst, 0.f, origin_dst_md, dst);
-  }
-  async_sum(alpha_1, summand_desc, summand, 1.f, dst_desc, dst);
-  return exit_primitive(
-      async_activation_forward(adesc, 1.f, dst_desc, dst, 0.f, dst_desc, dst));
-}
-
-inline
-sycl::event engine_ext::async_convolution_backward_data(
-    convolution_desc &desc, ::dnnl::algorithm alg, float alpha,
-    const memory_desc_ext &weight_desc, void *weight,
-    const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta,
-    const memory_desc_ext &diff_src_desc, void *diff_src) {
-
-  if (scale_parameter_preprocess({{alpha, beta, diff_dst_desc, diff_dst}})) {
-    return sycl::event();
-  }
-
-  auto help_weight_desc =
-      get_group_weight_desc(desc.get_group_count(), weight_desc);
-
-  auto origin_weight_md = help_weight_desc;
-  auto origin_diff_src_md = diff_src_desc.get_desc();
-  auto origin_diff_dst_md = diff_dst_desc.get_desc();
-  auto diff_src_md = transfer_memory_desc_to_format_tag_any(origin_diff_src_md);
-  auto diff_dst_md = transfer_memory_desc_to_format_tag_any(origin_diff_dst_md);
-  auto weight_md = transfer_memory_desc_to_format_tag_any(origin_weight_md);
-
-  ::dnnl::primitive_attr attr;
-  attr.set_fpmath_mode(desc.get_math_mode());
-
-  auto forward_primitive = create_primitive_desc<::dnnl::convolution_forward>(
-      ::dnnl::prop_kind::forward_training, ::dnnl::algorithm::convolution_auto,
-      diff_src_md, weight_md, diff_dst_md, desc.get_stride(), desc.get_dilate(),
-      desc.get_padding(), desc.get_padding(), attr);
-
-  auto primitive_args =
-      create_primitive_args_or_get<::dnnl::convolution_backward_data>(
-          ::dnnl::algorithm::convolution_auto, diff_src_md, weight_md,
-          diff_dst_md, desc.get_stride(), desc.get_dilate(), desc.get_padding(),
-          desc.get_padding(), forward_primitive, attr);
-
-  auto pd = get_primitive_desc<::dnnl::convolution_backward_data>(
-      primitive_args.second.primitive);
-  auto optimal_diff_src_md = pd.diff_src_desc();
-  auto optimal_diff_dst_md = pd.diff_dst_desc();
-  auto optimal_weight_md = pd.weights_desc();
-
-  enter_primitive(5 * optimal_diff_src_md.get_size() +
-                  optimal_diff_dst_md.get_size() +
-                  optimal_weight_md.get_size());
-
-  void *optimal_diff_src = diff_src, *optimal_diff_dst = diff_dst,
-       *optimal_weight = weight;
-  allocate_and_reorder_memory_to_optimal(origin_diff_dst_md, diff_dst,
-                                         optimal_diff_dst_md, optimal_diff_dst);
-  allocate_and_reorder_memory_to_optimal(origin_weight_md, weight,
-                                         optimal_weight_md, optimal_weight);
-  if (beta == 0.f) {
-    if (origin_diff_src_md != optimal_diff_src_md) {
-      optimal_diff_src = allocate(optimal_diff_src_md);
-    }
-  } else {
-    allocate_and_reorder_memory_to_optimal(
-        origin_diff_src_md, diff_src, optimal_diff_src_md, optimal_diff_src);
-  }
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, optimal_diff_dst_md,
-             optimal_diff_dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_WEIGHTS, optimal_weight_md,
-             optimal_weight);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_SRC, optimal_diff_src_md,
-             optimal_diff_src);
-
-  auto e = execute_primitive<::dnnl::convolution_backward_data>(
-      primitive_args,
-      {{alpha, beta, DNNL_ARG_DIFF_SRC, optimal_diff_src_md, optimal_diff_src}});
-
-  if (origin_diff_src_md != optimal_diff_src_md) {
-    e = async_reorder(1.f, optimal_diff_src_md, optimal_diff_src, 0.f,
-                      origin_diff_src_md, diff_src);
-  }
-  return exit_primitive(e);
-}
-
-inline
-sycl::event engine_ext::async_convolution_backward_weight(
-    convolution_desc &desc, ::dnnl::algorithm alg, float alpha,
-    const memory_desc_ext &src_desc, void *src,
-    const memory_desc_ext &diff_dst_desc, void *diff_dst, float beta,
-    const memory_desc_ext &diff_weight_desc, void *diff_weight) {
-
-  if (scale_parameter_preprocess(
-          {{alpha, beta, diff_weight_desc, diff_weight}})) {
-    return sycl::event();
-  }
-
-  auto help_diff_weight_desc =
-      get_group_weight_desc(desc.get_group_count(), diff_weight_desc);
-
-  ::dnnl::primitive_attr attr;
-  attr.set_fpmath_mode(desc.get_math_mode());
-
-  auto origin_diff_weight_md = help_diff_weight_desc;
-  auto origin_src_md = src_desc.get_desc();
-  auto origin_diff_dst_md = diff_dst_desc.get_desc();
-  auto src_md = transfer_memory_desc_to_format_tag_any(origin_src_md);
-  auto diff_dst_md = transfer_memory_desc_to_format_tag_any(origin_diff_dst_md);
-  auto diff_weight_md =
-      transfer_memory_desc_to_format_tag_any(origin_diff_weight_md);
-
-  auto forward_primitive = create_primitive_desc<::dnnl::convolution_forward>(
-      ::dnnl::prop_kind::forward_training, ::dnnl::algorithm::convolution_auto,
-      src_md, diff_weight_md, diff_dst_md, desc.get_stride(), desc.get_dilate(),
-      desc.get_padding(), desc.get_padding(), attr);
-
-  auto primitive_args =
-      create_primitive_args_or_get<::dnnl::convolution_backward_weights>(
-          ::dnnl::algorithm::convolution_auto, src_md, diff_weight_md,
-          diff_dst_md, desc.get_stride(), desc.get_dilate(), desc.get_padding(),
-          desc.get_padding(), forward_primitive, attr);
-
-  auto pd = get_primitive_desc<::dnnl::convolution_backward_weights>(
-      primitive_args.second.primitive);
-  auto optimal_src_md = pd.src_desc();
-  auto optimal_diff_dst_md = pd.diff_dst_desc();
-  auto optimal_diff_weight_md = pd.diff_weights_desc();
-
-  enter_primitive(optimal_diff_weight_md.get_size() * 5 +
-                  optimal_diff_dst_md.get_size() + optimal_src_md.get_size());
-
-  void *optimal_src = src, *optimal_diff_dst = diff_dst,
-       *optimal_diff_weight = diff_weight;
-  allocate_and_reorder_memory_to_optimal(origin_diff_dst_md, diff_dst,
-                                         optimal_diff_dst_md, optimal_diff_dst);
-  allocate_and_reorder_memory_to_optimal(origin_src_md, src, optimal_src_md,
-                                         optimal_src);
-  if (beta == 0.f) {
-    if (origin_diff_weight_md != optimal_diff_weight_md) {
-      optimal_diff_weight = allocate(optimal_diff_weight_md);
-    }
-  } else {
-    allocate_and_reorder_memory_to_optimal(origin_diff_weight_md, diff_weight,
-                                           optimal_diff_weight_md,
-                                           optimal_diff_weight);
-  }
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC, optimal_src_md,
-             optimal_src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_DST, optimal_diff_dst_md,
-             optimal_diff_dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DIFF_WEIGHTS,
-             optimal_diff_weight_md, optimal_diff_weight);
-
-  auto e = execute_primitive<::dnnl::convolution_backward_weights>(
-      primitive_args, {{alpha, beta, DNNL_ARG_DIFF_WEIGHTS,
-                        optimal_diff_weight_md, optimal_diff_weight}});
-
-  if (origin_diff_weight_md != optimal_diff_weight_md) {
-    e = async_reorder(1.f, optimal_diff_weight_md, optimal_diff_weight, 0.f,
-                      origin_diff_weight_md, diff_weight);
-  }
-  return exit_primitive(e);
-}
-
-inline
-sycl::event engine_ext::async_convolution_backward_bias(
-    float alpha, const memory_desc_ext &diff_dst_desc, void *diff_dst,
-    float beta, const memory_desc_ext &diff_bias_desc, void *diff_bias) {
-  return async_reduction(reduction_op::sum, alpha, diff_dst_desc, diff_dst, beta,
-                   diff_bias_desc, diff_bias);
-}
-
-inline
-void engine_ext::rnn_get_weight_space_size(const rnn_desc &desc,
-                                           size_t *weight_space_size) {
-  *weight_space_size = 0;
-  rnn_forward_internal(desc, ::dnnl::prop_kind::forward_inference,
-                       memory_desc_ext(), nullptr, memory_desc_ext(), nullptr,
-                       memory_desc_ext(), nullptr, nullptr, memory_desc_ext(),
-                       nullptr, nullptr, 0, nullptr, 0, nullptr, 0, nullptr, true,
-                       weight_space_size, nullptr, nullptr);
-  return;
-}
-
-inline
-void engine_ext::rnn_get_scratchpad_workspace_size(
-    const rnn_desc &desc, ::dnnl::prop_kind kind,
-    const memory_desc_ext &src_desc, size_t *scratchpad_size,
-    size_t *workspace_size) {
-  *workspace_size = 0;
-  *scratchpad_size = 0;
-  rnn_forward_internal(desc, kind, src_desc, nullptr, memory_desc_ext(),
-                       nullptr, memory_desc_ext(), nullptr, nullptr,
-                       memory_desc_ext(), nullptr, nullptr, 0, nullptr, 0,
-                       nullptr, 0, nullptr, true, nullptr, workspace_size,
-                       scratchpad_size);
-  return;
-}
-
-inline
-sycl::event engine_ext::async_rnn_forward(
-    const rnn_desc &desc, ::dnnl::prop_kind kind,
-    const memory_desc_ext &src_desc, void *src, const memory_desc_ext &dst_desc,
-    void *dst, const memory_desc_ext &iter_desc, void *src_iter, void *dst_iter,
-    const memory_desc_ext &iter_c_desc, void *src_iter_c, void *dst_iter_c,
-    size_t weight_size, void *weight, size_t scratchpad_size, void *scratchpad,
-    size_t workspace_size, void *workspace) {
-
-  return rnn_forward_internal(
-      desc, kind, src_desc, src, dst_desc, dst, iter_desc, src_iter, dst_iter,
-      iter_c_desc, src_iter_c, dst_iter_c, weight_size, weight, workspace_size,
-      workspace, scratchpad_size, scratchpad, false, nullptr, nullptr,
-      nullptr);
-}
-
-inline
-sycl::event engine_ext::async_rnn_backward(
-    const rnn_desc &desc, const memory_desc_ext &dst_desc, void *dst,
-    void *diff_dst, const memory_desc_ext &src_desc, void *src, void *diff_src,
-    const memory_desc_ext &iter_desc, void *src_iter, void *diff_dst_iter,
-    void *diff_src_iter, const memory_desc_ext &iter_c_desc, void *src_iter_c,
-    void *diff_dst_iter_c, void *diff_src_iter_c, size_t weight_size,
-    void *weight, void *diff_weight, size_t scratchpad_size, void *scratchpad,
-    size_t workspace_size, void *workspace) {
-  ::dnnl::memory::data_type src_dt;
-  ::dnnl::memory::format_tag src_format_tag;
-  rnn_mode mode;
-  rnn_memory_format_tag format_tag;
-  rnn_bias_mode bias_mode;
-  rnn_direction direction;
-  dpct::library_data_t dt;
-  int direction_num = 1, input_size = 0, hidden_size = 0, projection_size = 0,
-      layer_size = 0, gate_num = 1, output_size = 0, data_type_size = 0,
-      seq_length = 1, batch_size = 1;
-  void *last_layer_cache = nullptr;
-  void *hidden_layer_cache = nullptr;
-  sycl::event e;
-  enter_primitive(src_desc.get_size() * 2);
-  std::vector<int> offset(9, 0);
-  std::vector<void *> data = {
-      src,
-      dst,
-      (uint8_t *)src_iter + iter_desc.get_size(),
-      nullptr,
-      (uint8_t *)src_iter_c + iter_c_desc.get_size(),
-      nullptr,
-      (uint8_t *)weight + weight_size,
-      (uint8_t *)workspace + workspace_size,
-      diff_src,
-      diff_dst,
-      (uint8_t *)diff_src_iter + iter_desc.get_size(),
-      (uint8_t *)diff_dst_iter + iter_desc.get_size(),
-      (uint8_t *)diff_src_iter_c + iter_c_desc.get_size(),
-      (uint8_t *)diff_dst_iter_c + iter_c_desc.get_size(),
-      (uint8_t *)diff_weight + weight_size,
-      scratchpad};
-
-  desc.get(&mode, &bias_mode, &direction, &dt, &input_size, &hidden_size,
-           &projection_size, &layer_size);
-
-  get_rnn_configuration(src_desc.get_desc(), direction, mode, dt, hidden_size,
-                        &src_dt, &src_format_tag, &projection_size,
-                        &output_size, &seq_length, &batch_size, &direction_num,
-                        &gate_num);
-
-  if (direction == rnn_direction::bidirectional) {
-    if (layer_size > 1) {
-      last_layer_cache = allocate(src_desc);
-      hidden_layer_cache = allocate(src_desc);
-      data[8] = last_layer_cache;
-    }
-    e = execute_rnn_backward_primitive(
-        mode, ::dnnl::rnn_direction::bidirectional_concat, bias_mode, src_dt,
-        src_format_tag, seq_length, batch_size, output_size, 2 * output_size, 1,
-        direction_num, hidden_size, gate_num, projection_size, data, offset, 1);
-    if (layer_size > 1) {
-      data[8] = hidden_layer_cache;
-      data[9] = last_layer_cache;
-      e = execute_rnn_backward_primitive(
-          mode, ::dnnl::rnn_direction::bidirectional_sum, bias_mode, src_dt,
-          src_format_tag, seq_length, batch_size, output_size, output_size, 1,
-          direction_num, hidden_size, gate_num, projection_size, data, offset,
-          layer_size - 1);
-      _q->memcpy(diff_src,
-                 ((layer_size - 1) % 2 == 0) ? last_layer_cache
-                                             : hidden_layer_cache,
-                 src_desc.get_size());
-    }
-  } else {
-    e = execute_rnn_backward_primitive(
-        mode, ::dnnl::rnn_direction::unidirectional_left2right, bias_mode,
-        src_dt, src_format_tag, seq_length, batch_size, output_size,
-        output_size, layer_size, direction_num, hidden_size, gate_num,
-        projection_size, data, offset, 1);
-  }
-
-  return exit_primitive(e);
-}
-
-inline
-size_t engine_ext::get_dropout_state_size(){
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
-                           "Interfaces Project does not support this API.");
-#else
-  auto r = get_internal_resource(_q);
-  if(r->random_engine_state_size == -1){
-    auto rand_engine = rng_engine_t(*_q, 0);
-    r->random_engine_state_size =
-        oneapi::mkl::rng::get_state_size(rand_engine);
-  }
-  return r->random_engine_state_size;
-#endif
-}
-
-inline size_t
-engine_ext::get_dropout_workspace_size(const memory_desc_ext &src_desc) {
-  return src_desc.get_size();
-}
-
-inline
-sycl::event engine_ext::async_dropout_forward(dropout_desc &desc,
-                                              const memory_desc_ext &src_desc,
-                                              void *src,
-                                              const memory_desc_ext &dst_desc,
-                                              void *dst, void *workspace,
-                                              size_t workspace_size) {
-  if (workspace_size < src_desc.get_size()) {
-    throw std::runtime_error("async_dropout_forward: no sufficient workspace.");
-  }
-  enter_primitive(src_desc.get_size() * 2 + dst_desc.get_size() * 2);
-  float p = desc.get_probability();
-  if (p == 1.f) {
-    return _q->memset(dst, 0, dst_desc.get_size());
-  } else if (p == 0.f) {
-    return async_reorder(1.f, src_desc, src, 0.f, dst_desc, dst);
-  }
-
-  float scale_factor = 1.f / (1.f - p);
-  void *cache = workspace;
-
-  memory_desc_ext rng_data_desc(
-      ::dnnl::memory::desc(src_desc.get_dims(), ::dnnl::memory::data_type::s32,
-                           src_desc.get_strides()));
-  if (src_desc.get_desc().get_data_type() != ::dnnl::memory::data_type::s32) {
-    cache = allocate(rng_data_desc);
-  }
-
-  desc.generate(_q, get_dropout_state_size(), rng_data_desc.get_element_num(),
-                (std::int32_t *)cache);
-
-  if (cache == workspace) {
-    async_scale(scale_factor, src_desc, workspace);
-  } else {
-    async_reorder(scale_factor, rng_data_desc, cache, 0.f, src_desc, workspace);
-  }
-
-  auto primitive_args = create_primitive_args_or_get<::dnnl::binary>(
-      ::dnnl::algorithm::binary_mul, src_desc.get_desc(), src_desc.get_desc(),
-      dst_desc.get_desc());
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_0, src_desc.get_desc(),
-             src);
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_1, src_desc.get_desc(),
-             workspace);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, dst_desc.get_desc(),
-             dst);
-
-  return exit_primitive(execute_primitive<::dnnl::binary>(primitive_args));
-}
-
-inline
-sycl::event engine_ext::async_dropout_backward(
-    dropout_desc &desc, const memory_desc_ext &diff_dst_desc,
-    void *diff_dst, const memory_desc_ext &diff_src_desc, void *diff_src,
-    void *workspace, size_t workspace_size) {
-  enter_primitive(2 * diff_src_desc.get_size());
-  float p = desc.get_probability();
-  if (p == 1.f) {
-    return _q->memset(diff_src, 0, diff_src_desc.get_size());
-  } else if (p == 0.f) {
-    return async_reorder(1.f, diff_dst_desc, diff_dst, 0.f, diff_src_desc,
-                         diff_src);
-  }
-
-  auto primitive_args = create_primitive_args_or_get<::dnnl::binary>(
-      ::dnnl::algorithm::binary_mul, diff_dst_desc.get_desc(),
-      diff_dst_desc.get_desc(), diff_src_desc.get_desc());
-
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_0,
-             diff_dst_desc.get_desc(), diff_dst);
-  insert_arg(primitive_args.second.args, DNNL_ARG_SRC_1,
-             diff_dst_desc.get_desc(), workspace);
-  insert_arg(primitive_args.second.args, DNNL_ARG_DST, diff_src_desc.get_desc(),
-             diff_src);
-
-  return exit_primitive(execute_primitive<::dnnl::binary>(primitive_args));
-}
-} // namespace dnnl
-} // namespace dpct
-
-#endif // __DPCT_DNNL_UTILS_HPP__
diff --git a/dpct/dpct.hpp b/dpct/dpct.hpp
deleted file mode 100644
index 8cc312f0ea31d..0000000000000
--- a/dpct/dpct.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-//==---- dpct.hpp ---------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_HPP__
-#define __DPCT_HPP__
-
-#include <sycl/sycl.hpp>
-#include <iostream>
-#include <limits.h>
-#include <math.h>
-
-template <class... Args> class dpct_kernel_name;
-template <int Arg> class dpct_kernel_scalar;
-
-#include "atomic.hpp"
-#include "device.hpp"
-#include "image.hpp"
-#include "kernel.hpp"
-#include "math.hpp"
-#include "memory.hpp"
-#include "util.hpp"
-
-#if defined(_MSC_VER)
-#define __dpct_align__(n) __declspec(align(n))
-#define __dpct_inline__ __forceinline
-#else
-#define __dpct_align__(n) __attribute__((aligned(n)))
-#define __dpct_inline__ __inline__ __attribute__((always_inline))
-#endif
-
-#if defined(_MSC_VER)
-#define __dpct_noinline__ __declspec(noinline)
-#else
-#define __dpct_noinline__ __attribute__((noinline))
-#endif
-
-#define DPCT_COMPATIBILITY_TEMP (900)
-
-namespace dpct{
-enum error_code { success = 0, default_error = 999 };
-}
-
-#define DPCT_CHECK_ERROR(expr)                                                 \
-  [&]() {                                                                      \
-    try {                                                                      \
-      expr;                                                                    \
-      return dpct::success;                                                    \
-    } catch (std::exception const &e) {                                        \
-      std::cerr << e.what() << std::endl;                                      \
-      return dpct::default_error;                                              \
-    }                                                                          \
-  }()
-
-#define DPCT_PI_F (3.14159274101257f)
-#define DPCT_PI (3.141592653589793115998)
-
-#endif // __DPCT_HPP__
diff --git a/dpct/dpl_extras/algorithm.h b/dpct/dpl_extras/algorithm.h
deleted file mode 100644
index 7c98b7a2282f9..0000000000000
--- a/dpct/dpl_extras/algorithm.h
+++ /dev/null
@@ -1,2419 +0,0 @@
-//==---- algorithm.h ------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_ALGORITHM_H__
-#define __DPCT_ALGORITHM_H__
-
-#include <oneapi/dpl/algorithm>
-#include <oneapi/dpl/execution>
-#include <oneapi/dpl/numeric>
-
-#include "functional.h"
-#include "iterators.h"
-#include "vector.h"
-
-namespace dpct {
-
-template <typename Policy, typename Iter1, typename Iter2, typename Pred,
-          typename T>
-void replace_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p,
-                const T &new_value) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  std::transform(
-      std::forward<Policy>(policy), first, last, mask, first,
-      internal::replace_if_fun<typename std::iterator_traits<Iter1>::value_type,
-                               Pred>(p, new_value));
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Pred, typename T>
-Iter3 replace_copy_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
-                      Iter3 result, Pred p, const T &new_value) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  return std::transform(
-      std::forward<Policy>(policy), first, last, mask, result,
-      internal::replace_if_fun<typename std::iterator_traits<Iter3>::value_type,
-                               Pred>(p, new_value));
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Pred>
-internal::enable_if_hetero_execution_policy<Policy, Iter1>
-remove_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  using oneapi::dpl::make_zip_iterator;
-  using policy_type = typename std::decay<Policy>::type;
-  using internal::__buffer;
-  using ValueType = typename std::iterator_traits<Iter1>::value_type;
-
-  __buffer<ValueType> _tmp(std::distance(first, last));
-
-  auto end = std::copy_if(
-      policy, make_zip_iterator(first, mask),
-      make_zip_iterator(last, mask + std::distance(first, last)),
-      make_zip_iterator(_tmp.get(), oneapi::dpl::discard_iterator()),
-      internal::negate_predicate_key_fun<Pred>(p));
-  return std::copy(std::forward<Policy>(policy), _tmp.get(),
-                   std::get<0>(end.base()), first);
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Pred>
-typename std::enable_if<!internal::is_hetero_execution_policy<
-                            typename std::decay<Policy>::type>::value,
-                        Iter1>::type
-remove_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  using oneapi::dpl::make_zip_iterator;
-  using policy_type = typename std::decay<Policy>::type;
-  using ValueType = typename std::iterator_traits<Iter1>::value_type;
-
-  std::vector<ValueType> _tmp(std::distance(first, last));
-
-  auto end = std::copy_if(
-      policy, make_zip_iterator(first, mask),
-      make_zip_iterator(last, mask + std::distance(first, last)),
-      make_zip_iterator(_tmp.begin(), oneapi::dpl::discard_iterator()),
-      internal::negate_predicate_key_fun<Pred>(p));
-  return std::copy(policy, _tmp.begin(), std::get<0>(end.base()), first);
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Pred>
-Iter3 remove_copy_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
-                     Iter3 result, Pred p) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  using oneapi::dpl::make_zip_iterator;
-  auto ret_val = std::remove_copy_if(
-      std::forward<Policy>(policy), make_zip_iterator(first, mask),
-      make_zip_iterator(last, mask + std::distance(first, last)),
-      make_zip_iterator(result, oneapi::dpl::discard_iterator()),
-      internal::predicate_key_fun<Pred>(p));
-  return std::get<0>(ret_val.base());
-}
-
-template <class Policy, class Iter1, class Iter2, class BinaryPred>
-std::pair<Iter1, Iter2> unique(Policy &&policy, Iter1 keys_first,
-                               Iter1 keys_last, Iter2 values_first,
-                               BinaryPred binary_pred) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::unique(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(keys_first, values_first),
-      oneapi::dpl::make_zip_iterator(
-          keys_last, values_first + std::distance(keys_first, keys_last)),
-      internal::compare_key_fun<BinaryPred>(binary_pred));
-  auto n1 = std::distance(
-      oneapi::dpl::make_zip_iterator(keys_first, values_first), ret_val);
-  return std::make_pair(keys_first + n1, values_first + n1);
-}
-
-template <class Policy, class Iter1, class Iter2>
-std::pair<Iter1, Iter2> unique(Policy &&policy, Iter1 keys_first,
-                               Iter1 keys_last, Iter2 values_first) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  using T = typename std::iterator_traits<Iter1>::value_type;
-  return unique(std::forward<Policy>(policy), keys_first, keys_last,
-                values_first, std::equal_to<T>());
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
-          class BinaryPred>
-std::pair<Iter3, Iter4> unique_copy(Policy &&policy, Iter1 keys_first,
-                                    Iter1 keys_last, Iter2 values_first,
-                                    Iter3 keys_result, Iter4 values_result,
-                                    BinaryPred binary_pred) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::unique_copy(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(keys_first, values_first),
-      oneapi::dpl::make_zip_iterator(
-          keys_last, values_first + std::distance(keys_first, keys_last)),
-      oneapi::dpl::make_zip_iterator(keys_result, values_result),
-      internal::unique_fun<BinaryPred>(binary_pred));
-  auto n1 = std::distance(
-      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
-  return std::make_pair(keys_result + n1, values_result + n1);
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4>
-std::pair<Iter3, Iter4> unique_copy(Policy &&policy, Iter1 keys_first,
-                                    Iter1 keys_last, Iter2 values_first,
-                                    Iter3 keys_result, Iter4 values_result) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  using T = typename std::iterator_traits<Iter1>::value_type;
-  auto comp = std::equal_to<T>();
-  return unique_copy(std::forward<Policy>(policy), keys_first, keys_last,
-                     values_first, keys_result, values_result, comp);
-}
-
-template <typename Policy, typename Iter, typename Pred>
-Iter partition_point(Policy &&policy, Iter first, Iter last, Pred p) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter>::iterator_category,
-                   std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  if (std::is_partitioned(policy, first, last, p))
-    return std::find_if_not(std::forward<Policy>(policy), first, last, p);
-  else
-    return first;
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Pred>
-Iter3 copy_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
-              Iter3 result, Pred pred) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::copy_if(
-      std::forward<Policy>(policy), oneapi::dpl::make_zip_iterator(first, mask),
-      oneapi::dpl::make_zip_iterator(last, mask + std::distance(first, last)),
-      oneapi::dpl::make_zip_iterator(result, oneapi::dpl::discard_iterator()),
-      internal::predicate_key_fun<Pred>(pred));
-  return std::get<0>(ret_val.base());
-}
-
-template <class Policy, class Iter1, class Iter2, class UnaryOperation,
-          class Pred>
-Iter2 transform_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 result,
-                   UnaryOperation unary_op, Pred pred) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  using T = typename std::iterator_traits<Iter1>::value_type;
-  const auto n = std::distance(first, last);
-  std::for_each(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(first, result),
-      oneapi::dpl::make_zip_iterator(first, result) + n,
-      internal::transform_if_fun<T, Pred, UnaryOperation>(pred, unary_op));
-  return result + n;
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3,
-          class UnaryOperation, class Pred>
-Iter3 transform_if(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
-                   Iter3 result, UnaryOperation unary_op, Pred pred) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  using T = typename std::iterator_traits<Iter1>::value_type;
-  using Ref1 = typename std::iterator_traits<Iter1>::reference;
-  using Ref2 = typename std::iterator_traits<Iter2>::reference;
-  const auto n = std::distance(first, last);
-  std::for_each(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(first, mask, result),
-      oneapi::dpl::make_zip_iterator(first, mask, result) + n,
-      internal::transform_if_unary_zip_mask_fun<T, Pred, UnaryOperation>(
-          pred, unary_op));
-  return result + n;
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
-          class BinaryOperation, class Pred>
-Iter4 transform_if(Policy &&policy, Iter1 first1, Iter1 last1, Iter2 first2,
-                   Iter3 mask, Iter4 result, BinaryOperation binary_op,
-                   Pred pred) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  const auto n = std::distance(first1, last1);
-  using ZipIterator =
-      typename oneapi::dpl::zip_iterator<Iter1, Iter2, Iter3, Iter4>;
-  using T = typename std::iterator_traits<ZipIterator>::value_type;
-  std::for_each(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(first1, first2, mask, result),
-      oneapi::dpl::make_zip_iterator(last1, first2 + n, mask + n, result + n),
-      internal::transform_if_zip_mask_fun<T, Pred, BinaryOperation>(pred,
-                                                                    binary_op));
-  return result + n;
-}
-
-template <typename Policy, typename InputIter1, typename InputIter2,
-          typename OutputIter>
-void scatter(Policy &&policy, InputIter1 first, InputIter1 last, InputIter2 map,
-             OutputIter result) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<InputIter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<
-              typename std::iterator_traits<InputIter2>::iterator_category,
-              std::random_access_iterator_tag>::value &&
-          std::is_same<
-              typename std::iterator_traits<OutputIter>::iterator_category,
-              std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  oneapi::dpl::copy(policy, first, last,
-                    oneapi::dpl::make_permutation_iterator(result, map));
-}
-
-template <typename Policy, typename InputIter1, typename InputIter2,
-          typename OutputIter>
-OutputIter gather(Policy &&policy, InputIter1 map_first, InputIter1 map_last,
-                  InputIter2 input_first, OutputIter result) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<InputIter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<
-              typename std::iterator_traits<InputIter2>::iterator_category,
-              std::random_access_iterator_tag>::value &&
-          std::is_same<
-              typename std::iterator_traits<OutputIter>::iterator_category,
-              std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto perm_begin =
-      oneapi::dpl::make_permutation_iterator(input_first, map_first);
-  const int n = ::std::distance(map_first, map_last);
-
-  return oneapi::dpl::copy(policy, perm_begin, perm_begin + n, result);
-}
-
-template <typename Policy, typename InputIter1, typename InputIter2,
-          typename InputIter3, typename OutputIter, typename Predicate>
-void scatter_if(Policy &&policy, InputIter1 first, InputIter1 last,
-                InputIter2 map, InputIter3 mask, OutputIter result,
-                Predicate pred) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<InputIter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<
-              typename std::iterator_traits<InputIter2>::iterator_category,
-              std::random_access_iterator_tag>::value &&
-          std::is_same<
-              typename std::iterator_traits<InputIter3>::iterator_category,
-              std::random_access_iterator_tag>::value &&
-          std::is_same<
-              typename std::iterator_traits<OutputIter>::iterator_category,
-              std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  transform_if(
-      ::std::forward<Policy>(policy), first, last, mask,
-      oneapi::dpl::make_permutation_iterator(result, map),
-      [=](auto &&v) { return v; }, [=](auto &&m) { return pred(m); });
-}
-
-template <typename Policy, typename InputIter1, typename InputIter2,
-          typename InputIter3, typename OutputIter>
-void scatter_if(Policy &&policy, InputIter1 first, InputIter1 last,
-                InputIter2 map, InputIter3 mask, OutputIter result) {
-  scatter_if(::std::forward<Policy>(policy), first, last, map, mask, result,
-             internal::no_op_fun());
-}
-
-template <typename Policy, typename InputIter1, typename InputIter2,
-          typename InputIter3, typename OutputIter, typename Predicate>
-OutputIter gather_if(Policy &&policy, InputIter1 map_first, InputIter1 map_last,
-                     InputIter2 mask, InputIter3 input_first, OutputIter result,
-                     Predicate pred) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<InputIter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<
-              typename std::iterator_traits<InputIter2>::iterator_category,
-              std::random_access_iterator_tag>::value &&
-          std::is_same<
-              typename std::iterator_traits<InputIter3>::iterator_category,
-              std::random_access_iterator_tag>::value &&
-          std::is_same<
-              typename std::iterator_traits<OutputIter>::iterator_category,
-              std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto perm_begin =
-      oneapi::dpl::make_permutation_iterator(input_first, map_first);
-  const int n = std::distance(map_first, map_last);
-
-  return transform_if(
-      ::std::forward<Policy>(policy), perm_begin, perm_begin + n, mask, result,
-      [=](auto &&v) { return v; }, [=](auto &&m) { return pred(m); });
-}
-
-template <typename Policy, typename InputIter1, typename InputIter2,
-          typename InputIter3, typename OutputIter>
-OutputIter gather_if(Policy &&policy, InputIter1 map_first, InputIter1 map_last,
-                     InputIter2 mask, InputIter3 input_first,
-                     OutputIter result) {
-  return gather_if(::std::forward<Policy>(policy), map_first, map_last, mask,
-                   input_first, result, internal::no_op_fun());
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4, typename Iter5, typename Iter6>
-std::pair<Iter5, Iter6>
-merge(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, Iter2 keys_first2,
-      Iter2 keys_last2, Iter3 values_first1, Iter4 values_first2,
-      Iter5 keys_result, Iter6 values_result) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto n1 = std::distance(keys_first1, keys_last1);
-  auto n2 = std::distance(keys_first2, keys_last2);
-  std::merge(std::forward<Policy>(policy),
-             oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
-             oneapi::dpl::make_zip_iterator(keys_last1, values_first1 + n1),
-             oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
-             oneapi::dpl::make_zip_iterator(keys_last2, values_first2 + n2),
-             oneapi::dpl::make_zip_iterator(keys_result, values_result),
-             internal::compare_key_fun<>());
-  return std::make_pair(keys_result + n1 + n2, values_result + n1 + n2);
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4, typename Iter5, typename Iter6, typename Comp>
-std::pair<Iter5, Iter6>
-merge(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1, Iter2 keys_first2,
-      Iter2 keys_last2, Iter3 values_first1, Iter4 values_first2,
-      Iter5 keys_result, Iter6 values_result, Comp comp) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto n1 = std::distance(keys_first1, keys_last1);
-  auto n2 = std::distance(keys_first2, keys_last2);
-  std::merge(std::forward<Policy>(policy),
-             oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
-             oneapi::dpl::make_zip_iterator(keys_last1, values_first1 + n1),
-             oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
-             oneapi::dpl::make_zip_iterator(keys_last2, values_first2 + n2),
-             oneapi::dpl::make_zip_iterator(keys_result, values_result),
-             internal::compare_key_fun<Comp>(comp));
-  return std::make_pair(keys_result + n1 + n2, values_result + n1 + n2);
-}
-
-template <class Policy, class Iter, class T>
-void iota(Policy &&policy, Iter first, Iter last, T init, T step) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter>::iterator_category,
-                   std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  using DiffSize = typename std::iterator_traits<Iter>::difference_type;
-  std::transform(
-      std::forward<Policy>(policy), oneapi::dpl::counting_iterator<DiffSize>(0),
-      oneapi::dpl::counting_iterator<DiffSize>(std::distance(first, last)),
-      first, internal::sequence_fun<T>(init, step));
-}
-
-template <class Policy, class Iter, class T>
-void iota(Policy &&policy, Iter first, Iter last, T init) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter>::iterator_category,
-                   std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  iota(std::forward<Policy>(policy), first, last, init, T(1));
-}
-
-template <class Policy, class Iter>
-void iota(Policy &&policy, Iter first, Iter last) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter>::iterator_category,
-                   std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  using DiffSize = typename std::iterator_traits<Iter>::difference_type;
-  iota(std::forward<Policy>(policy), first, last, DiffSize(0), DiffSize(1));
-}
-
-template <class Policy, class Iter1, class Iter2, class Comp>
-void sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last,
-          Iter2 values_first, Comp comp) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto first = oneapi::dpl::make_zip_iterator(keys_first, values_first);
-  auto last = first + std::distance(keys_first, keys_last);
-  std::sort(std::forward<Policy>(policy), first, last,
-            internal::compare_key_fun<Comp>(comp));
-}
-
-template <class Policy, class Iter1, class Iter2>
-void sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last,
-          Iter2 values_first) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  sort(std::forward<Policy>(policy), keys_first, keys_last, values_first,
-       internal::__less());
-}
-
-template <class Policy, class Iter1, class Iter2, class Comp>
-void stable_sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last,
-                 Iter2 values_first, Comp comp) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  std::stable_sort(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(keys_first, values_first),
-      oneapi::dpl::make_zip_iterator(
-          keys_last, values_first + std::distance(keys_first, keys_last)),
-      internal::compare_key_fun<Comp>(comp));
-}
-
-template <class Policy, class Iter1, class Iter2>
-void stable_sort(Policy &&policy, Iter1 keys_first, Iter1 keys_last,
-                 Iter2 values_first) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  stable_sort(std::forward<Policy>(policy), keys_first, keys_last, values_first,
-              internal::__less());
-}
-
-template <class Policy, class Iter, class Operator>
-void for_each_index(Policy &&policy, Iter first, Iter last, Operator unary_op) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter>::iterator_category,
-                   std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  using DiffSize = typename std::iterator_traits<Iter>::difference_type;
-  std::transform(
-      std::forward<Policy>(policy), oneapi::dpl::counting_iterator<DiffSize>(0),
-      oneapi::dpl::counting_iterator<DiffSize>(std::distance(first, last)),
-      first, unary_op);
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
-          class Iter5>
-std::pair<Iter4, Iter5>
-set_intersection(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
-                 Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1,
-                 Iter4 keys_result, Iter5 values_result) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::set_intersection(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
-      oneapi::dpl::make_zip_iterator(
-          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
-      oneapi::dpl::make_zip_iterator(keys_first2,
-                                     oneapi::dpl::discard_iterator()),
-      oneapi::dpl::make_zip_iterator(keys_last2,
-                                     oneapi::dpl::discard_iterator()),
-      oneapi::dpl::make_zip_iterator(keys_result, values_result),
-      internal::compare_key_fun<>());
-  auto n1 = std::distance(
-      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
-  return std::make_pair(keys_result + n1, values_result + n1);
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
-          class Iter5, class Comp>
-std::pair<Iter4, Iter5>
-set_intersection(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
-                 Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1,
-                 Iter4 keys_result, Iter5 values_result, Comp comp) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::set_intersection(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
-      oneapi::dpl::make_zip_iterator(
-          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
-      oneapi::dpl::make_zip_iterator(keys_first2,
-                                     oneapi::dpl::discard_iterator()),
-      oneapi::dpl::make_zip_iterator(keys_last2,
-                                     oneapi::dpl::discard_iterator()),
-      oneapi::dpl::make_zip_iterator(keys_result, values_result),
-      internal::compare_key_fun<Comp>(comp));
-  auto n1 = std::distance(
-      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
-  return std::make_pair(keys_result + n1, values_result + n1);
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
-          class Iter5, class Iter6>
-std::pair<Iter5, Iter6>
-set_symmetric_difference(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
-                         Iter2 keys_first2, Iter2 keys_last2,
-                         Iter3 values_first1, Iter4 values_first2,
-                         Iter5 keys_result, Iter6 values_result) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::set_symmetric_difference(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
-      oneapi::dpl::make_zip_iterator(
-          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
-      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
-      oneapi::dpl::make_zip_iterator(
-          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
-      oneapi::dpl::make_zip_iterator(keys_result, values_result),
-      internal::compare_key_fun<>());
-  auto n1 = std::distance(
-      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
-  return std::make_pair(keys_result + n1, values_result + n1);
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
-          class Iter5, class Iter6, class Comp>
-std::pair<Iter5, Iter6>
-set_symmetric_difference(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
-                         Iter2 keys_first2, Iter2 keys_last2,
-                         Iter3 values_first1, Iter4 values_first2,
-                         Iter5 keys_result, Iter6 values_result, Comp comp) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::set_symmetric_difference(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
-      oneapi::dpl::make_zip_iterator(
-          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
-      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
-      oneapi::dpl::make_zip_iterator(
-          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
-      oneapi::dpl::make_zip_iterator(keys_result, values_result),
-      internal::compare_key_fun<Comp>(comp));
-  auto n1 = std::distance(
-      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
-  return std::make_pair(keys_result + n1, values_result + n1);
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
-          class Iter5, class Iter6>
-std::pair<Iter5, Iter6>
-set_difference(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
-               Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1,
-               Iter4 values_first2, Iter5 keys_result, Iter6 values_result) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::set_difference(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
-      oneapi::dpl::make_zip_iterator(
-          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
-      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
-      oneapi::dpl::make_zip_iterator(
-          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
-      oneapi::dpl::make_zip_iterator(keys_result, values_result),
-      internal::compare_key_fun<>());
-  auto n1 = std::distance(
-      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
-  return std::make_pair(keys_result + n1, values_result + n1);
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
-          class Iter5, class Iter6, class Comp>
-std::pair<Iter5, Iter6> set_difference(Policy &&policy, Iter1 keys_first1,
-                                       Iter1 keys_last1, Iter2 keys_first2,
-                                       Iter2 keys_last2, Iter3 values_first1,
-                                       Iter4 values_first2, Iter5 keys_result,
-                                       Iter6 values_result, Comp comp) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::set_difference(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
-      oneapi::dpl::make_zip_iterator(
-          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
-      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
-      oneapi::dpl::make_zip_iterator(
-          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
-      oneapi::dpl::make_zip_iterator(keys_result, values_result),
-      internal::compare_key_fun<Comp>(comp));
-  auto n1 = std::distance(
-      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
-  return std::make_pair(keys_result + n1, values_result + n1);
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
-          class Iter5, class Iter6>
-internal::enable_if_execution_policy<Policy, std::pair<Iter5, Iter6>>
-set_union(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
-          Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1,
-          Iter4 values_first2, Iter5 keys_result, Iter6 values_result) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::set_union(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
-      oneapi::dpl::make_zip_iterator(
-          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
-      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
-      oneapi::dpl::make_zip_iterator(
-          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
-      oneapi::dpl::make_zip_iterator(keys_result, values_result),
-      internal::compare_key_fun<>());
-  auto n1 = std::distance(
-      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
-  return std::make_pair(keys_result + n1, values_result + n1);
-}
-
-template <class Policy, class Iter1, class Iter2, class Iter3, class Iter4,
-          class Iter5, class Iter6, class Comp>
-internal::enable_if_execution_policy<Policy, std::pair<Iter5, Iter6>>
-set_union(Policy &&policy, Iter1 keys_first1, Iter1 keys_last1,
-          Iter2 keys_first2, Iter2 keys_last2, Iter3 values_first1,
-          Iter4 values_first2, Iter5 keys_result, Iter6 values_result,
-          Comp comp) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter5>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter6>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::set_union(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(keys_first1, values_first1),
-      oneapi::dpl::make_zip_iterator(
-          keys_last1, values_first1 + std::distance(keys_first1, keys_last1)),
-      oneapi::dpl::make_zip_iterator(keys_first2, values_first2),
-      oneapi::dpl::make_zip_iterator(
-          keys_last2, values_first2 + std::distance(keys_first2, keys_last2)),
-      oneapi::dpl::make_zip_iterator(keys_result, values_result),
-      internal::compare_key_fun<Comp>(comp));
-  auto n1 = std::distance(
-      oneapi::dpl::make_zip_iterator(keys_result, values_result), ret_val);
-  return std::make_pair(keys_result + n1, values_result + n1);
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4, typename Pred>
-internal::enable_if_execution_policy<Policy, std::pair<Iter3, Iter4>>
-stable_partition_copy(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
-                      Iter3 out_true, Iter4 out_false, Pred p) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  auto ret_val = std::partition_copy(
-      std::forward<Policy>(policy), oneapi::dpl::make_zip_iterator(first, mask),
-      oneapi::dpl::make_zip_iterator(last, mask + std::distance(first, last)),
-      oneapi::dpl::make_zip_iterator(out_true, oneapi::dpl::discard_iterator()),
-      oneapi::dpl::make_zip_iterator(out_false,
-                                     oneapi::dpl::discard_iterator()),
-      internal::predicate_key_fun<Pred>(p));
-  return std::make_pair(std::get<0>(ret_val.first.base()),
-                        std::get<0>(ret_val.second.base()));
-}
-
-template <typename Policy, typename Iter1, typename Iter3, typename Iter4,
-          typename Pred>
-internal::enable_if_execution_policy<Policy, std::pair<Iter3, Iter4>>
-stable_partition_copy(Policy &&policy, Iter1 first, Iter1 last, Iter3 out_true,
-                      Iter4 out_false, Pred p) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  return std::partition_copy(std::forward<Policy>(policy), first, last,
-                             out_true, out_false, p);
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4, typename Pred>
-internal::enable_if_execution_policy<Policy, std::pair<Iter3, Iter4>>
-partition_copy(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask,
-               Iter3 out_true, Iter4 out_false, Pred p) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter3>::iterator_category,
-                       std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter4>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  return stable_partition_copy(std::forward<Policy>(policy), first, last, mask,
-                               out_true, out_false, p);
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Pred>
-internal::enable_if_hetero_execution_policy<Policy, Iter1>
-stable_partition(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  typedef typename std::decay<Policy>::type policy_type;
-  internal::__buffer<typename std::iterator_traits<Iter1>::value_type> _tmp(
-      std::distance(first, last));
-
-  std::copy(policy, mask, mask + std::distance(first, last), _tmp.get());
-
-  auto ret_val =
-      std::stable_partition(std::forward<Policy>(policy),
-                            oneapi::dpl::make_zip_iterator(first, _tmp.get()),
-                            oneapi::dpl::make_zip_iterator(
-                                last, _tmp.get() + std::distance(first, last)),
-                            internal::predicate_key_fun<Pred>(p));
-  return std::get<0>(ret_val.base());
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Pred>
-typename std::enable_if<!internal::is_hetero_execution_policy<
-                            typename std::decay<Policy>::type>::value,
-                        Iter1>::type
-stable_partition(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  typedef typename std::decay<Policy>::type policy_type;
-  std::vector<typename std::iterator_traits<Iter1>::value_type> _tmp(
-      std::distance(first, last));
-
-  std::copy(policy, mask, mask + std::distance(first, last), _tmp.begin());
-
-  auto ret_val = std::stable_partition(
-      std::forward<Policy>(policy),
-      oneapi::dpl::make_zip_iterator(first, _tmp.begin()),
-      oneapi::dpl::make_zip_iterator(last,
-                                     _tmp.begin() + std::distance(first, last)),
-      internal::predicate_key_fun<Pred>(p));
-  return std::get<0>(ret_val.base());
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Pred>
-internal::enable_if_execution_policy<Policy, Iter1>
-partition(Policy &&policy, Iter1 first, Iter1 last, Iter2 mask, Pred p) {
-  static_assert(
-      std::is_same<typename std::iterator_traits<Iter1>::iterator_category,
-                   std::random_access_iterator_tag>::value &&
-          std::is_same<typename std::iterator_traits<Iter2>::iterator_category,
-                       std::random_access_iterator_tag>::value,
-      "Iterators passed to algorithms must be random-access iterators.");
-  return stable_partition(std::forward<Policy>(policy), first, last, mask, p);
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4>
-inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
-                          dpct::internal::is_iterator<Iter2>::value &&
-                          dpct::internal::is_iterator<Iter3>::value &&
-                          dpct::internal::is_iterator<Iter4>::value>
-sort_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
-           Iter4 values_out, ::std::int64_t n, bool descending = false,
-           int begin_bit = 0,
-           int end_bit =
-               sizeof(typename ::std::iterator_traits<Iter1>::value_type) * 8);
-
-template <typename Policy, typename Iter1, typename Iter2>
-inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
-                          dpct::internal::is_iterator<Iter2>::value>
-sort_keys(Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n,
-          bool descending = false, int begin_bit = 0,
-          int end_bit =
-              sizeof(typename ::std::iterator_traits<Iter1>::value_type) * 8);
-
-namespace internal {
-
-// Transforms key to a specific bit range and sorts the transformed key
-template <typename Policy, typename Iter1, typename Iter2,
-          typename TransformedKeyT>
-inline void transform_and_sort(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
-                               ::std::int64_t n, bool descending, int begin_bit,
-                               int end_bit) {
-  using key_t_value_t = typename std::iterator_traits<Iter1>::value_type;
-  auto trans_key =
-      translate_key<key_t_value_t, TransformedKeyT>(begin_bit, end_bit);
-
-  // Use of the comparison operator that is not simply std::greater() or
-  // std::less() will result in
-  //  not using radix sort which will cost some performance.  However, this is
-  //  necessary to provide the transformation of the key to the bitrange
-  //  desired.
-  auto partial_sort_with_comp = [&](const auto &comp) {
-    return oneapi::dpl::partial_sort_copy(
-        std::forward<Policy>(policy), keys_in, keys_in + n, keys_out,
-        keys_out + n, [=](const auto a, const auto b) {
-          return comp(trans_key(a), trans_key(b));
-        });
-  };
-  if (descending)
-    partial_sort_with_comp(::std::greater<TransformedKeyT>());
-  else
-    partial_sort_with_comp(::std::less<TransformedKeyT>());
-}
-
-template <typename Policy, typename Iter1, typename Iter2>
-inline void sort_only(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
-                      ::std::int64_t n, bool descending) {
-  using key_t_value_t = typename ::std::iterator_traits<Iter1>::value_type;
-
-  if constexpr (::std::is_floating_point<key_t_value_t>::value) {
-    if (descending) {
-      // Comparison operator that is not std::greater() ensures stability of
-      // -0.0 and 0.0
-      // at the cost of some performance because radix sort will not be used.
-      auto comp_descending = [=](const auto a, const auto b) { return a > b; };
-
-      oneapi::dpl::partial_sort_copy(::std::forward<Policy>(policy), keys_in,
-                                     keys_in + n, keys_out, keys_out + n,
-                                     comp_descending);
-    } else {
-      // Comparison operator that is not std::less() ensures stability of -0.0
-      // and 0.0
-      // at the cost of some performance because radix sort will not be used.
-      auto comp_ascending = [=](const auto a, const auto b) { return a < b; };
-
-      oneapi::dpl::partial_sort_copy(::std::forward<Policy>(policy), keys_in,
-                                     keys_in + n, keys_out, keys_out + n,
-                                     comp_ascending);
-    }
-  } else {
-    if (descending) {
-      oneapi::dpl::partial_sort_copy(::std::forward<Policy>(policy), keys_in,
-                                     keys_in + n, keys_out, keys_out + n,
-                                     ::std::greater<key_t_value_t>());
-    } else {
-
-      oneapi::dpl::partial_sort_copy(::std::forward<Policy>(policy), keys_in,
-                                     keys_in + n, keys_out, keys_out + n);
-    }
-  }
-}
-
-// Transforms key from a pair to a specific bit range and sorts the pairs by the
-// transformed key
-template <typename Policy, typename Iter1, typename Iter2,
-          typename TransformedKeyT, typename Iter3, typename Iter4>
-inline void
-transform_and_sort_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
-                         Iter3 values_in, Iter4 values_out, ::std::int64_t n,
-                         bool descending, int begin_bit, int end_bit) {
-  using key_t_value_t = typename std::iterator_traits<Iter1>::value_type;
-  auto zip_input = oneapi::dpl::zip_iterator(keys_in, values_in);
-  auto zip_output = oneapi::dpl::zip_iterator(keys_out, values_out);
-  auto trans_key =
-      translate_key<key_t_value_t, TransformedKeyT>(begin_bit, end_bit);
-
-  // Use of the comparison operator that is not simply std::greater() or
-  // std::less() will result in
-  //  not using radix sort which will cost some performance.  However, this is
-  //  necessary to provide the transformation of the key to the bitrange desired
-  //  and also to select the key from the zipped pair.
-  auto load_val = [=](const auto a) { return trans_key(std::get<0>(a)); };
-
-  auto partial_sort_with_comp = [&](const auto &comp) {
-    return oneapi::dpl::partial_sort_copy(
-        std::forward<Policy>(policy), zip_input, zip_input + n, zip_output,
-        zip_output + n, [=](const auto a, const auto b) {
-          return comp(load_val(a), load_val(b));
-        });
-  };
-  if (descending)
-    partial_sort_with_comp(::std::greater<key_t_value_t>());
-  else
-    partial_sort_with_comp(::std::less<key_t_value_t>());
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4>
-inline void sort_only_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
-                            Iter3 values_in, Iter4 values_out, ::std::int64_t n,
-                            bool descending) {
-  using key_t_value_t = typename ::std::iterator_traits<Iter1>::value_type;
-  auto zip_input = oneapi::dpl::zip_iterator(keys_in, values_in);
-  auto zip_output = oneapi::dpl::zip_iterator(keys_out, values_out);
-
-  // Use of the comparison operator that is not simply std::greater() or
-  // std::less() will result in
-  //  not using radix sort which will cost some performance.  However, this is
-  //  necessary to select the key from the zipped pair.
-  auto load_val = [=](const auto a) { return std::get<0>(a); };
-
-  auto partial_sort_with_comp = [&](const auto &comp) {
-    return oneapi::dpl::partial_sort_copy(
-        std::forward<Policy>(policy), zip_input, zip_input + n, zip_output,
-        zip_output + n, [=](const auto a, const auto b) {
-          return comp(load_val(a), load_val(b));
-        });
-  };
-  if (descending)
-    partial_sort_with_comp(::std::greater<key_t_value_t>());
-  else
-    partial_sort_with_comp(::std::less<key_t_value_t>());
-}
-
-// overload for Iter2 != std::nullptr_t
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4>
-typename ::std::enable_if<!::std::is_null_pointer<Iter2>::value>::type
-sort_pairs_impl(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
-                Iter4 values_out, ::std::int64_t n, bool descending,
-                int begin_bit, int end_bit) {
-  using key_t_value_t = typename ::std::iterator_traits<Iter1>::value_type;
-
-  int clipped_begin_bit = ::std::max(begin_bit, 0);
-  int clipped_end_bit =
-      ::std::min((::std::uint64_t)end_bit, sizeof(key_t_value_t) * 8);
-  int num_bytes = (clipped_end_bit - clipped_begin_bit - 1) / 8 + 1;
-
-  auto transform_and_sort_pairs_f = [&](auto x) {
-    using T = typename ::std::decay_t<decltype(x)>;
-    internal::transform_and_sort_pairs<decltype(policy), Iter1, Iter2, T, Iter3,
-                                       Iter4>(
-        ::std::forward<Policy>(policy), keys_in, keys_out, values_in,
-        values_out, n, descending, clipped_begin_bit, clipped_end_bit);
-  };
-
-  if (clipped_end_bit - clipped_begin_bit == sizeof(key_t_value_t) * 8) {
-    internal::sort_only_pairs(::std::forward<Policy>(policy), keys_in, keys_out,
-                              values_in, values_out, n, descending);
-  } else if (num_bytes == 1) {
-    transform_and_sort_pairs_f.template operator()<uint8_t>(0);
-  } else if (num_bytes == 2) {
-    transform_and_sort_pairs_f.template operator()<uint16_t>(0);
-  } else if (num_bytes <= 4) {
-    transform_and_sort_pairs_f.template operator()<uint32_t>(0);
-  } else // if (num_bytes <= 8)
-  {
-    transform_and_sort_pairs_f.template operator()<::std::uint64_t>(0);
-  }
-}
-
-// overload for Iter2 == std::nullptr_t
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4>
-typename ::std::enable_if<::std::is_null_pointer<Iter2>::value>::type
-sort_pairs_impl(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
-                Iter4 values_out, ::std::int64_t n, bool descending,
-                int begin_bit, int end_bit) {
-  // create temporary keys_out to discard, memory footprint could be improved by
-  // a specialized iterator with a single
-  // unchanging dummy Iter1 element
-  using key_t_value_t = typename std::iterator_traits<Iter1>::value_type;
-  sycl::buffer<key_t_value_t, 1> temp_keys_out{sycl::range<1>(n)};
-  internal::sort_pairs_impl(std::forward<Policy>(policy), keys_in,
-                            oneapi::dpl::begin(temp_keys_out), values_in,
-                            values_out, n, descending, begin_bit, end_bit);
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4, typename Iter5>
-inline void segmented_sort_pairs_by_parallel_sorts(
-    Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter4 values_in,
-    Iter3 values_out, ::std::int64_t n, ::std::int64_t nsegments,
-    Iter5 begin_offsets, Iter5 end_offsets, bool descending = false,
-    int begin_bit = 0,
-    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
-                  8) {
-  using offset_type = typename ::std::iterator_traits<Iter5>::value_type;
-  ::std::vector<offset_type> host_accessible_offset_starts(nsegments);
-  ::std::vector<offset_type> host_accessible_offset_ends(nsegments);
-  // make offsets accessible on host
-  ::std::copy(policy, begin_offsets, begin_offsets + nsegments,
-              host_accessible_offset_starts.begin());
-  ::std::copy(policy, end_offsets, end_offsets + nsegments,
-              host_accessible_offset_ends.begin());
-
-  for (::std::uint64_t i = 0; i < nsegments; i++) {
-    ::std::uint64_t segment_begin = host_accessible_offset_starts[i];
-    ::std::uint64_t segment_end =
-        ::std::min(n, (::std::int64_t)host_accessible_offset_ends[i]);
-    if (segment_begin < segment_end) {
-      ::dpct::sort_pairs(
-          policy, keys_in + segment_begin, keys_out + segment_begin,
-          values_in + segment_begin, values_out + segment_begin,
-          segment_end - segment_begin, descending, begin_bit, end_bit);
-    }
-  }
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
-inline void segmented_sort_keys_by_parallel_sorts(
-    Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n,
-    ::std::int64_t nsegments, Iter3 begin_offsets, Iter3 end_offsets,
-    bool descending = false, int begin_bit = 0,
-    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
-                  8) {
-  using offset_type = typename ::std::iterator_traits<Iter3>::value_type;
-  ::std::vector<offset_type> host_accessible_offset_starts(nsegments);
-  ::std::vector<offset_type> host_accessible_offset_ends(nsegments);
-  // make offsets accessible on host
-  ::std::copy(policy, begin_offsets, begin_offsets + nsegments,
-              host_accessible_offset_starts.begin());
-  ::std::copy(policy, end_offsets, end_offsets + nsegments,
-              host_accessible_offset_ends.begin());
-
-  for (::std::uint64_t i = 0; i < nsegments; i++) {
-    ::std::uint64_t segment_begin = host_accessible_offset_starts[i];
-    ::std::uint64_t segment_end =
-        ::std::min(n, (::std::int64_t)host_accessible_offset_ends[i]);
-    if (segment_begin < segment_end) {
-      ::dpct::sort_keys(policy, keys_in + segment_begin,
-                        keys_out + segment_begin, segment_end - segment_begin,
-                        descending, begin_bit, end_bit);
-    }
-  }
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4, typename Iter5>
-inline void segmented_sort_pairs_by_parallel_for_of_sorts(
-    Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
-    Iter4 values_out, ::std::int64_t n, ::std::int64_t nsegments,
-    Iter5 begin_offsets, Iter5 end_offsets, bool descending = false,
-    int begin_bit = 0,
-    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
-                  8) {
-  policy.queue().submit([&](sycl::handler &cgh) {
-    cgh.parallel_for(nsegments, [=](sycl::id<1> i) {
-      ::std::uint64_t segment_begin = begin_offsets[i];
-      ::std::uint64_t segment_end =
-          ::std::min(n, (::std::int64_t)end_offsets[i]);
-      if (segment_begin == segment_end) {
-        return;
-      }
-      ::dpct::sort_pairs(::std::execution::seq, keys_in + segment_begin,
-                         keys_out + segment_begin, values_in + segment_begin,
-                         values_out + segment_begin,
-                         segment_end - segment_begin, descending, begin_bit,
-                         end_bit);
-    });
-  });
-  policy.queue().wait();
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
-inline void segmented_sort_keys_by_parallel_for_of_sorts(
-    Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n,
-    ::std::int64_t nsegments, Iter3 begin_offsets, Iter3 end_offsets,
-    bool descending = false, int begin_bit = 0,
-    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
-                  8) {
-  policy.queue().submit([&](sycl::handler &cgh) {
-    cgh.parallel_for(nsegments, [=](sycl::id<1> i) {
-      ::std::uint64_t segment_begin = begin_offsets[i];
-      ::std::uint64_t segment_end =
-          ::std::min(n, (::std::int64_t)end_offsets[i]);
-      if (segment_begin == segment_end) {
-        return;
-      }
-      ::dpct::sort_keys(::std::execution::seq, keys_in + segment_begin,
-                        keys_out + segment_begin, segment_end - segment_begin,
-                        descending, begin_bit, end_bit);
-    });
-  });
-  policy.queue().wait();
-}
-
-template <typename Policy, typename OffsetIteratorT>
-inline void mark_segments(Policy &&policy, OffsetIteratorT begin_offsets,
-                          OffsetIteratorT end_offsets, ::std::int64_t n,
-                          ::std::int64_t nsegments,
-                          sycl::buffer<::std::size_t, 1> segments) {
-
-  ::std::size_t work_group_size =
-      policy.queue()
-          .get_device()
-          .template get_info<sycl::info::device::max_work_group_size>();
-
-  auto sg_sizes = policy.queue()
-                      .get_device()
-                      .template get_info<sycl::info::device::sub_group_sizes>();
-  ::std::size_t sub_group_size = sg_sizes.empty() ? 0 : sg_sizes.back();
-
-  float avg_seg_size = (float)n / (float)nsegments;
-  if (avg_seg_size > work_group_size) {
-    // If average segment size is larger than workgroup, use workgroup to
-    // coordinate to mark segments
-    policy.queue()
-        .submit([&](sycl::handler &h) {
-          auto segments_acc = segments.get_access<sycl::access_mode::write>(h);
-          h.parallel_for(work_group_size, ([=](sycl::id<1> id) {
-                           for (::std::size_t seg = 0; seg < nsegments; seg++) {
-                             ::std::size_t i = begin_offsets[seg];
-                             ::std::size_t end = end_offsets[seg];
-                             while (i + id < end) {
-                               segments_acc[i + id] = seg;
-                               i += work_group_size;
-                             }
-                           }
-                         }));
-        })
-        .wait();
-  } else if (sub_group_size > 0 && avg_seg_size > sub_group_size / 2) {
-    // If average segment size is larger than half a subgroup, use subgroup to
-    // coordinate to mark segments
-    policy.queue()
-        .submit([&](sycl::handler &h) {
-          auto segments_acc = segments.get_access<sycl::access_mode::write>(h);
-          h.parallel_for(
-              sycl::nd_range<1>{work_group_size, work_group_size},
-              ([=](sycl::nd_item<1> item) {
-                auto sub_group = item.get_sub_group();
-                ::std::size_t num_subgroups =
-                    sub_group.get_group_range().size();
-                ::std::size_t local_size = sub_group.get_local_range().size();
-
-                ::std::size_t sub_group_id = sub_group.get_group_id();
-                while (sub_group_id < nsegments) {
-                  ::std::size_t subgroup_local_id = sub_group.get_local_id();
-                  ::std::size_t i = begin_offsets[sub_group_id];
-                  ::std::size_t end = end_offsets[sub_group_id];
-                  while (i + subgroup_local_id < end) {
-                    segments_acc[i + subgroup_local_id] = sub_group_id;
-                    i += local_size;
-                  }
-                  sub_group_id += num_subgroups;
-                }
-              }));
-        })
-        .wait();
-  } else {
-    // If average segment size is small as compared to subgroup, use single
-    // work item to mark each segment
-    policy.queue()
-        .submit([&](sycl::handler &h) {
-          auto segments_acc = segments.get_access<sycl::access_mode::write>(h);
-          h.parallel_for(nsegments, ([=](sycl::id<1> seg) {
-                           for (::std::size_t i = begin_offsets[seg];
-                                i < end_offsets[seg]; i++) {
-                             segments_acc[i] = seg;
-                           }
-                         }));
-        })
-        .wait();
-  }
-}
-
-// The dpl_histogram namespace contains a temporary preview of an upcoming
-// oneDPL histogram API.  This namespace will be removed and replaced with
-// corresponding calls to oneapi::dpl::histogram()
-namespace dpl_histogram {
-
-template <typename T1, typename T2>
-constexpr inline auto __ceiling_div(const T1 &number, const T2 &divisor) {
-  return (number - 1) / divisor + 1;
-}
-
-template <typename T1, bool _IsFloatingPoint>
-struct __evenly_divided_binhash_impl {};
-
-template <typename T>
-struct __evenly_divided_binhash_impl<T, /* is_floating_point = */ true> {
-  T __minimum;
-  ::std::uint32_t __num_bins;
-  T __scale;
-  T __maximum;
-  __evenly_divided_binhash_impl(const T &min, const T &max,
-                                const ::std::uint32_t &num_bins)
-      : __minimum(min), __maximum(max), __num_bins(num_bins),
-        __scale(T(num_bins) / (max - min)) {}
-  template <typename T2> std::uint32_t operator()(T2 &&value) const {
-    return ::std::uint32_t((::std::forward<T2>(value) - __minimum) * __scale);
-  }
-
-  template <typename T2> bool is_valid(const T2 &value) const {
-    return value >= __minimum && value < __maximum;
-  }
-};
-
-// non floating point type
-template <typename T>
-struct __evenly_divided_binhash_impl<T, /* is_floating_point= */ false> {
-  T __minimum;
-  ::std::uint32_t __num_bins;
-  T __range_size;
-  __evenly_divided_binhash_impl(const T &min, const T &max,
-                                const ::std::uint32_t &num_bins)
-      : __minimum(min), __num_bins(num_bins), __range_size(max - min) {}
-  template <typename T2>::std::uint32_t operator()(T2 &&value) const {
-    return ::std::uint32_t(
-        ((::std::uint64_t(::std::forward<T2>(value)) - __minimum) *
-         ::std::uint64_t(__num_bins)) /
-        __range_size);
-  }
-
-  template <typename T2> bool is_valid(const T2 &value) const {
-    return value >= __minimum && value < __minimum + __range_size;
-  }
-};
-
-template <typename T1>
-using __evenly_divided_binhash =
-    __evenly_divided_binhash_impl<T1, ::std::is_floating_point_v<T1>>;
-
-template <typename Range> struct __custom_range_binhash {
-  Range __boundaries;
-  __custom_range_binhash(Range boundaries) : __boundaries(boundaries) {}
-
-  template <typename T>::std::uint32_t operator()(T &&value) const {
-    return (::std::upper_bound(__boundaries.begin(), __boundaries.end(),
-                               ::std::forward<T>(value)) -
-            __boundaries.begin()) -
-           1;
-  }
-
-  template <typename T2> bool is_valid(const T2 &value) const {
-    return value >= __boundaries[0] &&
-           value < __boundaries[__boundaries.size() - 1];
-  }
-};
-
-template <typename HistAccessor, typename OffsetT, typename Size>
-inline void __clear_wglocal_histograms(const HistAccessor &local_histogram,
-                                       const OffsetT &offset,
-                                       const Size &num_bins,
-                                       const sycl::nd_item<1> &self_item) {
-  ::std::uint32_t gSize = self_item.get_local_range()[0];
-  ::std::uint32_t self_lidx = self_item.get_local_id(0);
-  ::std::uint8_t factor = __ceiling_div(num_bins, gSize);
-  ::std::uint8_t k;
-  _DPCT_PRAGMA_UNROLL
-  for (k = 0; k < factor - 1; k++) {
-    local_histogram[offset + gSize * k + self_lidx] = 0;
-  }
-  if (gSize * k + self_lidx < num_bins) {
-    local_histogram[offset + gSize * k + self_lidx] = 0;
-  }
-  self_item.barrier(sycl::access::fence_space::local_space);
-}
-
-template <typename BinIdxType, typename Iter1, typename HistReg,
-          typename BinFunc>
-inline void __accum_local_register_iter(const Iter1 &in_acc,
-                                        const ::std::size_t &index,
-                                        HistReg *histogram, BinFunc func) {
-  const auto &x = in_acc[index];
-  if (func.is_valid(x)) {
-    BinIdxType c = func(x);
-    histogram[c]++;
-  }
-}
-
-template <typename BinIdxType, sycl::access::address_space AddressSpace,
-          typename Iter1, typename HistAccessor, typename OffsetT,
-          typename BinFunc>
-inline void __accum_local_atomics_iter(const Iter1 &in_acc,
-                                       const ::std::size_t &index,
-                                       const HistAccessor &wg_local_histogram,
-                                       const OffsetT &offset, BinFunc func) {
-  using __histo_value_type = typename HistAccessor::value_type;
-  const auto &x = in_acc[index];
-  if (func.is_valid(x)) {
-    BinIdxType c = func(x);
-    sycl::atomic_ref<__histo_value_type, sycl::memory_order::relaxed,
-                     sycl::memory_scope::work_group, AddressSpace>
-        local_bin(wg_local_histogram[offset + c]);
-    local_bin++;
-  }
-}
-
-template <typename BinType, typename HistAccessorIn, typename OffsetT,
-          typename HistAccessorOut, typename Size>
-inline void __reduce_out_histograms(const HistAccessorIn &in_histogram,
-                                    const OffsetT &offset,
-                                    const HistAccessorOut &out_histogram,
-                                    const Size &num_bins,
-                                    const sycl::nd_item<1> &self_item) {
-  ::std::uint32_t gSize = self_item.get_local_range()[0];
-  ::std::uint32_t self_lidx = self_item.get_local_id(0);
-  ::std::uint8_t factor = __ceiling_div(num_bins, gSize);
-  ::std::uint8_t k;
-
-  _DPCT_PRAGMA_UNROLL
-  for (k = 0; k < factor - 1; k++) {
-    sycl::atomic_ref<BinType, sycl::memory_order::relaxed,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space>
-        global_bin(out_histogram[gSize * k + self_lidx]);
-    global_bin += in_histogram[offset + gSize * k + self_lidx];
-  }
-  if (gSize * k + self_lidx < num_bins) {
-    sycl::atomic_ref<BinType, sycl::memory_order::relaxed,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space>
-        global_bin(out_histogram[gSize * k + self_lidx]);
-    global_bin += in_histogram[offset + gSize * k + self_lidx];
-  }
-}
-
-template <::std::uint16_t ItersPerWorkItem, ::std::uint8_t BinsPerWorkItem,
-          typename BinType, typename Policy, typename Range1, typename Range2,
-          typename Size, typename IdxHashFunc, typename... Range3>
-inline void __histogram_general_registers_local_reduction(
-    Policy &&policy, ::std::uint16_t work_group_size, Range1 &&input,
-    Range2 &&bins, const Size &num_bins, IdxHashFunc func,
-    Range3 &&...opt_range) {
-  const ::std::size_t N = input.size();
-  using __local_histogram_type = ::std::uint32_t;
-  using __private_histogram_type = ::std::uint16_t;
-
-  ::std::size_t segments = __ceiling_div(N, work_group_size * ItersPerWorkItem);
-  auto e = policy.queue().submit([&](auto &h) {
-    // Temporary use of stable non-public API from oneDPL,  this function will
-    // be replaced with oneDPL call in an upcoming release.
-    oneapi::dpl::__ranges::__require_access(h, input, bins, opt_range...);
-    sycl::local_accessor<__local_histogram_type, 1> local_histogram(
-        sycl::range(num_bins), h);
-    h.parallel_for(
-        sycl::nd_range<1>(segments * work_group_size, work_group_size),
-        [=](sycl::nd_item<1> __self_item) {
-          using __bin_idx_type = ::std::uint8_t;
-          const ::std::size_t __self_lidx = __self_item.get_local_id(0);
-          const ::std::size_t __wgroup_idx = __self_item.get_group(0);
-          const ::std::size_t __seg_start =
-              work_group_size * ItersPerWorkItem * __wgroup_idx;
-
-          __clear_wglocal_histograms(local_histogram, 0, num_bins, __self_item);
-          __private_histogram_type histogram[BinsPerWorkItem];
-          _DPCT_PRAGMA_UNROLL
-          for (::std::uint8_t k = 0; k < BinsPerWorkItem; k++) {
-            histogram[k] = 0;
-          }
-
-          if (__seg_start + work_group_size * ItersPerWorkItem < N) {
-            _DPCT_PRAGMA_UNROLL
-            for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) {
-              __accum_local_register_iter<__bin_idx_type>(
-                  input, __seg_start + idx * work_group_size + __self_lidx,
-                  histogram, func);
-            }
-          } else {
-            _DPCT_PRAGMA_UNROLL
-            for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) {
-              ::std::size_t __val_idx =
-                  __seg_start + idx * work_group_size + __self_lidx;
-              if (__val_idx < N) {
-                __accum_local_register_iter<__bin_idx_type>(input, __val_idx,
-                                                            histogram, func);
-              }
-            }
-          }
-
-          _DPCT_PRAGMA_UNROLL
-          for (::std::uint8_t k = 0; k < num_bins; k++) {
-            sycl::atomic_ref<__local_histogram_type,
-                             sycl::memory_order::relaxed,
-                             sycl::memory_scope::work_group,
-                             sycl::access::address_space::local_space>
-                local_bin(local_histogram[k]);
-            local_bin += histogram[k];
-          }
-
-          __self_item.barrier(sycl::access::fence_space::local_space);
-
-          __reduce_out_histograms<BinType>(local_histogram, 0, bins, num_bins,
-                                           __self_item);
-        });
-  });
-  e.wait();
-}
-
-template <::std::uint16_t ItersPerWorkItem, typename BinType, typename Policy,
-          typename Range1, typename Range2, typename Size, typename IdxHashFunc,
-          typename... Range3>
-inline void __histogram_general_local_atomics(Policy &&policy,
-                                              ::std::uint16_t work_group_size,
-                                              Range1 &&input, Range2 &&bins,
-                                              const Size &num_bins,
-                                              IdxHashFunc func,
-                                              Range3 &&...opt_range) {
-  const ::std::size_t N = input.size();
-  ::std::size_t segments = __ceiling_div(N, work_group_size * ItersPerWorkItem);
-  auto e = policy.queue().submit([&](auto &h) {
-    // Temporary use of stable non-public API from oneDPL,  this function will
-    // be replaced with oneDPL call in an upcoming release.
-    oneapi::dpl::__ranges::__require_access(h, input, bins, opt_range...);
-    sycl::local_accessor<::std::uint32_t, 1> local_histogram(
-        sycl::range(num_bins), h);
-    h.parallel_for(
-        sycl::nd_range<1>(segments * work_group_size, work_group_size),
-        [=](sycl::nd_item<1> __self_item) {
-          using __bin_idx_type = ::std::uint16_t;
-          constexpr auto __atomic_address_space =
-              sycl::access::address_space::local_space;
-          const ::std::size_t __self_lidx = __self_item.get_local_id(0);
-          const ::std::uint32_t __wgroup_idx = __self_item.get_group(0);
-          const ::std::size_t __seg_start =
-              work_group_size * __wgroup_idx * ItersPerWorkItem;
-
-          __clear_wglocal_histograms(local_histogram, 0, num_bins, __self_item);
-
-          if (__seg_start + work_group_size * ItersPerWorkItem < N) {
-            _DPCT_PRAGMA_UNROLL
-            for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) {
-              __accum_local_atomics_iter<__bin_idx_type,
-                                         __atomic_address_space>(
-                  input, __seg_start + idx * work_group_size + __self_lidx,
-                  local_histogram, 0, func);
-            }
-          } else {
-            _DPCT_PRAGMA_UNROLL
-            for (::std::uint8_t idx = 0; idx < ItersPerWorkItem; idx++) {
-              ::std::size_t __val_idx =
-                  __seg_start + idx * work_group_size + __self_lidx;
-              if (__val_idx < N) {
-                __accum_local_atomics_iter<__bin_idx_type,
-                                           __atomic_address_space>(
-                    input, __val_idx, local_histogram, 0, func);
-              }
-            }
-          }
-          __self_item.barrier(sycl::access::fence_space::local_space);
-
-          __reduce_out_histograms<BinType>(local_histogram, 0, bins, num_bins,
-                                           __self_item);
-        });
-  });
-
-  e.wait();
-}
-
-template <::std::uint16_t __min_iters_per_work_item, typename BinType,
-          typename Policy, typename Range1, typename Range2, typename Size,
-          typename IdxHashFunc, typename... Range3>
-inline void __histogram_general_private_global_atomics(
-    Policy &&policy, ::std::uint16_t work_group_size, Range1 &&input,
-    Range2 &&bins, const Size &num_bins, IdxHashFunc func,
-    Range3 &&...opt_range) {
-
-  const ::std::size_t N = input.size();
-  auto __global_mem_size =
-      policy.queue()
-          .get_device()
-          .template get_info<sycl::info::device::global_mem_size>();
-  const ::std::size_t max_segments =
-      ::std::min(__global_mem_size / (num_bins * sizeof(BinType)),
-                 __ceiling_div(N, work_group_size * __min_iters_per_work_item));
-  const ::std::size_t iters_per_work_item =
-      __ceiling_div(N, max_segments * work_group_size);
-  ::std::size_t segments =
-      __ceiling_div(N, work_group_size * iters_per_work_item);
-
-  sycl::buffer<BinType, 1> private_histograms(
-      sycl::range<1>(segments * num_bins));
-
-  auto e = policy.queue().submit([&](auto &h) {
-    // Temporary use of stable non-public API from oneDPL,  this function will
-    // be replaced with oneDPL call in an upcoming release.
-    oneapi::dpl::__ranges::__require_access(h, input, bins, opt_range...);
-    sycl::accessor hacc_private(private_histograms, h, sycl::read_write,
-                                sycl::no_init);
-    h.parallel_for(
-        sycl::nd_range<1>(segments * work_group_size, work_group_size),
-        [=](sycl::nd_item<1> __self_item) {
-          using __bin_idx_type = ::std::uint32_t;
-          constexpr auto __atomic_address_space =
-              sycl::access::address_space::global_space;
-          const ::std::size_t __self_lidx = __self_item.get_local_id(0);
-          const ::std::size_t __wgroup_idx = __self_item.get_group(0);
-          const ::std::size_t __seg_start =
-              work_group_size * iters_per_work_item * __wgroup_idx;
-
-          __clear_wglocal_histograms(hacc_private, __wgroup_idx * num_bins,
-                                     num_bins, __self_item);
-          if (__seg_start + work_group_size * iters_per_work_item < N) {
-            for (::std::size_t idx = 0; idx < iters_per_work_item; idx++) {
-              __accum_local_atomics_iter<__bin_idx_type,
-                                         __atomic_address_space>(
-                  input, __seg_start + idx * work_group_size + __self_lidx,
-                  hacc_private, __wgroup_idx * num_bins, func);
-            }
-          } else {
-            for (::std::size_t idx = 0; idx < iters_per_work_item; idx++) {
-              ::std::size_t __val_idx =
-                  __seg_start + idx * work_group_size + __self_lidx;
-              if (__val_idx < N) {
-                __accum_local_atomics_iter<__bin_idx_type,
-                                           __atomic_address_space>(
-                    input, __val_idx, hacc_private, __wgroup_idx * num_bins,
-                    func);
-              }
-            }
-          }
-          __self_item.barrier(sycl::access::fence_space::local_space);
-
-          __reduce_out_histograms<BinType>(hacc_private,
-                                           __wgroup_idx * num_bins, bins,
-                                           num_bins, __self_item);
-        });
-  });
-  e.wait();
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Size,
-          typename IdxHashFunc, typename... Range>
-inline Iter2
-__histogram_general_select_best(Policy &&policy, Iter1 first, Iter1 last,
-                                Iter2 histogram_first, const Size &num_bins,
-                                IdxHashFunc func, Range &&...opt_range) {
-  using __histo_value_type = typename ::std::iterator_traits<Iter2>::value_type;
-  auto __local_mem_size =
-      policy.queue()
-          .get_device()
-          .template get_info<sycl::info::device::local_mem_size>();
-  constexpr ::std::uint8_t __max_registers = 16;
-
-  // Temporary use of stable non-public API from oneDPL,  this function will be
-  // replaced with oneDPL call in an upcoming release.
-  auto keep_bins = oneapi::dpl::__ranges::__get_sycl_range<
-      oneapi::dpl::__par_backend_hetero::access_mode::write, Iter2>();
-  auto bins_buf = keep_bins(histogram_first, histogram_first + num_bins);
-
-  oneapi::dpl::fill(policy, bins_buf.all_view().begin(),
-                    bins_buf.all_view().end(), __histo_value_type(0));
-  auto N = last - first;
-  if (N > 0) {
-    // Temporary use of stable non-public API from oneDPL,  this function will
-    // be replaced with oneDPL call in an upcoming release.
-    auto keep_input = oneapi::dpl::__ranges::__get_sycl_range<
-        oneapi::dpl::__par_backend_hetero::access_mode::read, Iter1>();
-    auto input_buf = keep_input(first, last);
-
-    ::std::size_t max_work_group_size =
-        policy.queue()
-            .get_device()
-            .template get_info<sycl::info::device::max_work_group_size>();
-    ::std::size_t work_group_size =
-        ::std::min(max_work_group_size, ::std::size_t(1024));
-
-    if (num_bins < __max_registers) {
-
-      // If bins fit into registers, use register private accumulation
-      __histogram_general_registers_local_reduction<32, 16, __histo_value_type>(
-          ::std::forward<Policy>(policy), work_group_size, input_buf.all_view(),
-          bins_buf.all_view(), num_bins, func,
-          ::std::forward<Range...>(opt_range)...);
-    } else if (num_bins * sizeof(__histo_value_type) < __local_mem_size) {
-      // If bins fit into SLM, use local atomics
-
-      // Experimentally determined iters per work-item
-      if (N <= 524288) {
-        __histogram_general_local_atomics<4, __histo_value_type>(
-            ::std::forward<Policy>(policy), work_group_size,
-            input_buf.all_view(), bins_buf.all_view(), num_bins, func,
-            ::std::forward<Range...>(opt_range)...);
-      } else {
-        __histogram_general_local_atomics<32, __histo_value_type>(
-            ::std::forward<Policy>(policy), work_group_size,
-            input_buf.all_view(), bins_buf.all_view(), num_bins, func,
-            ::std::forward<Range...>(opt_range)...);
-      }
-    } else // Otherwise, use global atomics (private copies per workgroup)
-    {
-      // Experimentally determined iters per work-item
-      if (N <= 524288) {
-        __histogram_general_private_global_atomics<4, __histo_value_type>(
-            ::std::forward<Policy>(policy), work_group_size,
-            input_buf.all_view(), bins_buf.all_view(), num_bins, func,
-            ::std::forward<Range...>(opt_range)...);
-      } else {
-        __histogram_general_private_global_atomics<32, __histo_value_type>(
-            ::std::forward<Policy>(policy), work_group_size,
-            input_buf.all_view(), bins_buf.all_view(), num_bins, func,
-            ::std::forward<Range...>(opt_range)...);
-      }
-    }
-  }
-  return histogram_first + num_bins;
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Size,
-          typename T>
-inline ::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-        dpct::internal::is_iterator<Iter2>::value &&
-        internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value,
-    Iter2>
-histogram(Policy &&policy, Iter1 first, Iter1 last, Iter2 histogram_first,
-          const Size &num_bins, const T &first_bin_min_val,
-          const T &last_bin_max_val) {
-  return __histogram_general_select_best(
-      ::std::forward<Policy>(policy), first, last, histogram_first, num_bins,
-      __evenly_divided_binhash<T>(first_bin_min_val, last_bin_max_val,
-                                  num_bins));
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
-inline ::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-        dpct::internal::is_iterator<Iter2>::value &&
-        dpct::internal::is_iterator<Iter3>::value &&
-        internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value,
-    Iter2>
-histogram(Policy &&policy, Iter1 first, Iter1 last, Iter2 histogram_first,
-          Iter3 boundary_first, Iter3 boundary_last) {
-  // Temporary use of stable non-public API from oneDPL,  this function will be
-  // replaced with oneDPL call in an upcoming release.
-  auto keep_boundaries = oneapi::dpl::__ranges::__get_sycl_range<
-      oneapi::dpl::__par_backend_hetero::access_mode::read, Iter3>();
-  auto boundary_buf = keep_boundaries(boundary_first, boundary_last);
-
-  return __histogram_general_select_best(
-      ::std::forward<Policy>(policy), first, last, histogram_first,
-      (boundary_last - boundary_first) - 1,
-      __custom_range_binhash{boundary_buf.all_view()}, boundary_buf.all_view());
-}
-} // end namespace dpl_histogram
-
-} // end namespace internal
-
-// Evenly Divided Histogram of a 1-D array
-template <typename Policy, typename Iter1, typename Iter2, typename T,
-          typename Size>
-::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-    dpct::internal::is_iterator<Iter2>::value &&
-    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
-histogram_even(Policy &&policy, Iter1 d_samples, Iter2 d_histogram,
-               int num_levels, T lower_level, T upper_level, Size num_samples) {
-  internal::dpl_histogram::histogram(::std::forward<Policy>(policy), d_samples,
-                                     d_samples + num_samples, d_histogram,
-                                     num_levels - 1, lower_level, upper_level);
-}
-
-// Evenly Divided Histogram of a 2-D ROI in a flattened 2-D array
-template <typename Policy, typename Iter1, typename Iter2, typename T,
-          typename OffsetT>
-::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-    dpct::internal::is_iterator<Iter2>::value &&
-    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
-histogram_even_roi(Policy &&policy, Iter1 d_samples, Iter2 d_histogram,
-                   int num_levels, T lower_level, T upper_level,
-                   OffsetT num_row_samples, OffsetT num_rows,
-                   ::std::size_t row_stride_bytes) {
-  return histogram_even(
-      ::std::forward<Policy>(policy),
-      oneapi::dpl::permutation_iterator(
-          d_samples,
-          internal::__roi_2d_index_functor(
-              num_row_samples,
-              row_stride_bytes /
-                  sizeof(typename ::std::iterator_traits<Iter1>::value_type))),
-      d_histogram, num_levels, lower_level, upper_level,
-      num_row_samples * num_rows);
-}
-
-// Evenly Divided Multi-Channel Histogram of a 1-D array
-template <int NumChannels, int NumActiveChannels, typename Policy,
-          typename Iter1, typename Iter2, typename T, typename Size>
-::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-    dpct::internal::is_iterator<Iter2>::value &&
-    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
-multi_histogram_even(Policy &&policy, Iter1 d_samples,
-                     Iter2 d_histogram[NumActiveChannels],
-                     int num_levels[NumActiveChannels],
-                     T lower_level[NumActiveChannels],
-                     T upper_level[NumActiveChannels], Size num_pixels) {
-  for (int active_channel = 0; active_channel < NumActiveChannels;
-       active_channel++) {
-    histogram_even(
-        policy,
-        oneapi::dpl::permutation_iterator(
-            d_samples,
-            internal::__interleaved_index_functor(NumChannels, active_channel)),
-        d_histogram[active_channel], num_levels[active_channel],
-        lower_level[active_channel], upper_level[active_channel], num_pixels);
-  }
-}
-
-// Evenly Divided Multi-Channel Histogram of a 2-D ROI in a flattened 2-D array
-template <int NumChannels, int NumActiveChannels, typename Policy,
-          typename Iter1, typename Iter2, typename T, typename OffsetT>
-::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-    dpct::internal::is_iterator<Iter2>::value &&
-    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
-multi_histogram_even_roi(Policy &&policy, Iter1 d_samples,
-                         Iter2 d_histogram[NumActiveChannels],
-                         int num_levels[NumActiveChannels],
-                         T lower_level[NumActiveChannels],
-                         T upper_level[NumActiveChannels],
-                         OffsetT num_row_samples, OffsetT num_rows,
-                         ::std::size_t row_stride_bytes) {
-  for (int active_channel = 0; active_channel < NumActiveChannels;
-       active_channel++) {
-    histogram_even(
-        policy,
-        oneapi::dpl::permutation_iterator(
-            d_samples,
-            internal::__composition_functor(
-                internal::__roi_2d_index_functor(
-                    num_row_samples,
-                    row_stride_bytes /
-                        (NumChannels * sizeof(typename ::std::iterator_traits<
-                                              Iter1>::value_type))),
-                internal::__interleaved_index_functor(NumChannels,
-                                                      active_channel))),
-        d_histogram[active_channel], num_levels[active_channel],
-        lower_level[active_channel], upper_level[active_channel],
-        num_row_samples * num_rows);
-  }
-}
-
-// Custom Range Histogram of a 1-D array
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Size>
-::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-    dpct::internal::is_iterator<Iter2>::value &&
-    dpct::internal::is_iterator<Iter3>::value &&
-    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
-histogram_range(Policy &&policy, Iter1 d_samples, Iter2 d_histogram,
-                int num_levels, Iter3 d_levels, Size num_samples) {
-  internal::dpl_histogram::histogram(::std::forward<Policy>(policy), d_samples,
-                                     d_samples + num_samples, d_histogram,
-                                     d_levels, d_levels + num_levels);
-}
-
-// Custom Range Histogram of a 2-D ROI in a flattened 2-D Array
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename OffsetT>
-::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-    dpct::internal::is_iterator<Iter2>::value &&
-    dpct::internal::is_iterator<Iter3>::value &&
-    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
-histogram_range_roi(Policy &&policy, Iter1 d_samples, Iter2 d_histogram,
-                    int num_levels, Iter3 d_levels, OffsetT num_row_samples,
-                    OffsetT num_rows, ::std::size_t row_stride_bytes) {
-  return histogram_range(
-      ::std::forward<Policy>(policy),
-      oneapi::dpl::permutation_iterator(
-          d_samples,
-          internal::__roi_2d_index_functor(
-              num_row_samples,
-              row_stride_bytes /
-                  sizeof(typename ::std::iterator_traits<Iter1>::value_type))),
-      d_histogram, num_levels, d_levels, num_row_samples * num_rows);
-}
-
-// Custom Range Multi-Channel Histogram of a 1-D array
-template <int NumChannels, int NumActiveChannels, typename Policy,
-          typename Iter1, typename Iter2, typename Iter3, typename Size>
-::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-    dpct::internal::is_iterator<Iter2>::value &&
-    dpct::internal::is_iterator<Iter3>::value &&
-    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
-multi_histogram_range(Policy &&policy, Iter1 d_samples,
-                      Iter2 d_histogram[NumActiveChannels],
-                      int num_levels[NumActiveChannels],
-                      Iter3 d_levels[NumActiveChannels], Size num_pixels) {
-  for (int active_channel = 0; active_channel < NumActiveChannels;
-       active_channel++) {
-    histogram_range(policy,
-                    oneapi::dpl::permutation_iterator(
-                        d_samples, internal::__interleaved_index_functor(
-                                       NumChannels, active_channel)),
-                    d_histogram[active_channel], num_levels[active_channel],
-                    d_levels[active_channel], num_pixels);
-  }
-}
-
-// Custom Range Multi-Channel Histogram of a 2-D ROI in a flattened 2-D array
-template <int NumChannels, int NumActiveChannels, typename Policy,
-          typename Iter1, typename Iter2, typename Iter3, typename OffsetT>
-::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-    dpct::internal::is_iterator<Iter2>::value &&
-    dpct::internal::is_iterator<Iter3>::value &&
-    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
-multi_histogram_range_roi(Policy &&policy, Iter1 d_samples,
-                          Iter2 d_histogram[NumActiveChannels],
-                          int num_levels[NumActiveChannels],
-                          Iter3 d_levels[NumActiveChannels],
-                          OffsetT num_row_samples, OffsetT num_rows,
-                          ::std::size_t row_stride_bytes) {
-  for (int active_channel = 0; active_channel < NumActiveChannels;
-       active_channel++) {
-    histogram_range(
-        policy,
-        oneapi::dpl::permutation_iterator(
-            d_samples,
-            internal::__composition_functor(
-                internal::__roi_2d_index_functor(
-                    num_row_samples,
-                    row_stride_bytes /
-                        (NumChannels * sizeof(typename ::std::iterator_traits<
-                                              Iter1>::value_type))),
-                internal::__interleaved_index_functor(NumChannels,
-                                                      active_channel))),
-        d_histogram[active_channel], num_levels[active_channel],
-        d_levels[active_channel], num_row_samples * num_rows);
-  }
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4>
-inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
-                          dpct::internal::is_iterator<Iter2>::value &&
-                          dpct::internal::is_iterator<Iter3>::value &&
-                          dpct::internal::is_iterator<Iter4>::value>
-sort_pairs(Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
-           Iter4 values_out, ::std::int64_t n, bool descending, int begin_bit,
-           int end_bit) {
-  internal::sort_pairs_impl(std::forward<Policy>(policy), keys_in, keys_out,
-                            values_in, values_out, n, descending, begin_bit,
-                            end_bit);
-}
-
-template <typename Policy, typename Iter1, typename Iter2>
-inline void sort_pairs(
-    Policy &&policy, io_iterator_pair<Iter1> &keys,
-    io_iterator_pair<Iter2> &values, ::std::int64_t n, bool descending = false,
-    bool do_swap_iters = false, int begin_bit = 0,
-    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
-                  8) {
-  sort_pairs(::std::forward<Policy>(policy), keys.first(), keys.second(),
-             values.first(), values.second(), n, descending, begin_bit,
-             end_bit);
-  if (do_swap_iters) {
-    keys.swap();
-    values.swap();
-  }
-}
-
-template <typename Policy, typename Iter1, typename Iter2>
-inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
-                          dpct::internal::is_iterator<Iter2>::value>
-sort_keys(Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n,
-          bool descending, int begin_bit, int end_bit) {
-  using key_t_value_t = typename ::std::iterator_traits<Iter1>::value_type;
-
-  int clipped_begin_bit = ::std::max(begin_bit, 0);
-  int clipped_end_bit =
-      ::std::min((::std::uint64_t)end_bit, sizeof(key_t_value_t) * 8);
-  int num_bytes = (clipped_end_bit - clipped_begin_bit - 1) / 8 + 1;
-
-  auto transform_and_sort_f = [&](auto x) {
-    using T = typename ::std::decay_t<decltype(x)>;
-    internal::transform_and_sort<decltype(policy), Iter1, Iter2, T>(
-        ::std::forward<Policy>(policy), keys_in, keys_out, n, descending,
-        clipped_begin_bit, clipped_end_bit);
-  };
-
-  if (clipped_end_bit - clipped_begin_bit == sizeof(key_t_value_t) * 8) {
-    internal::sort_only(::std::forward<Policy>(policy), keys_in, keys_out, n,
-                        descending);
-  } else if (num_bytes == 1) {
-    transform_and_sort_f.template operator()<uint8_t>(0);
-  } else if (num_bytes == 2) {
-    transform_and_sort_f.template operator()<uint16_t>(0);
-  } else if (num_bytes <= 4) {
-    transform_and_sort_f.template operator()<uint32_t>(0);
-  } else // if (num_bytes <= 8)
-  {
-    transform_and_sort_f.template operator()<::std::uint64_t>(0);
-  }
-}
-
-template <typename Policy, typename Iter1>
-inline void sort_keys(
-    Policy &&policy, io_iterator_pair<Iter1> &keys, ::std::int64_t n,
-    bool descending = false, bool do_swap_iters = false, int begin_bit = 0,
-    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
-                  8) {
-  sort_keys(std::forward<Policy>(policy), keys.first(), keys.second(), n,
-            descending, begin_bit, end_bit);
-  if (do_swap_iters)
-    keys.swap();
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
-inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
-                          dpct::internal::is_iterator<Iter2>::value>
-segmented_sort_keys(
-    Policy &&policy, Iter1 keys_in, Iter2 keys_out, ::std::int64_t n,
-    ::std::int64_t nsegments, Iter3 begin_offsets, Iter3 end_offsets,
-    bool descending = false, int begin_bit = 0,
-    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
-                  8) {
-  int compute_units =
-      policy.queue()
-          .get_device()
-          .template get_info<sycl::info::device::max_compute_units>();
-  auto sg_sizes = policy.queue()
-                      .get_device()
-                      .template get_info<sycl::info::device::sub_group_sizes>();
-  int subgroup_size = sg_sizes.empty() ? 1 : sg_sizes.back();
-  // parallel for of serial sorts when we have sufficient number of segments for
-  // load balance when number of segments is large as compared to our target
-  // compute capability
-  if (nsegments >
-      compute_units *
-          (policy.queue().get_device().is_gpu() ? subgroup_size : 1)) {
-    dpct::internal::segmented_sort_keys_by_parallel_for_of_sorts(
-        ::std::forward<Policy>(policy), keys_in, keys_out, n, nsegments,
-        begin_offsets, end_offsets, descending, begin_bit, end_bit);
-  } else
-  {
-    dpct::internal::segmented_sort_keys_by_parallel_sorts(
-        ::std::forward<Policy>(policy), keys_in, keys_out, n, nsegments,
-        begin_offsets, end_offsets, descending, begin_bit, end_bit);
-  }
-}
-
-template <typename Policy, typename Iter1, typename Iter2>
-inline void segmented_sort_keys(
-    Policy &&policy, io_iterator_pair<Iter1> &keys, ::std::int64_t n,
-    ::std::int64_t nsegments, Iter2 begin_offsets, Iter2 end_offsets,
-    bool descending = false, bool do_swap_iters = false, int begin_bit = 0,
-    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
-                  8) {
-  segmented_sort_keys(::std::forward<Policy>(policy), keys.first(),
-                      keys.second(), n, nsegments, begin_offsets, end_offsets,
-                      descending, begin_bit, end_bit);
-  if (do_swap_iters) {
-    keys.swap();
-  }
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3,
-          typename Iter4, typename Iter5>
-inline ::std::enable_if_t<dpct::internal::is_iterator<Iter1>::value &&
-                          dpct::internal::is_iterator<Iter2>::value &&
-                          dpct::internal::is_iterator<Iter3>::value &&
-                          dpct::internal::is_iterator<Iter4>::value>
-segmented_sort_pairs(
-    Policy &&policy, Iter1 keys_in, Iter2 keys_out, Iter3 values_in,
-    Iter4 values_out, ::std::int64_t n, ::std::int64_t nsegments,
-    Iter5 begin_offsets, Iter5 end_offsets, bool descending = false,
-    int begin_bit = 0,
-    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
-                  8) {
-  int compute_units =
-      policy.queue()
-          .get_device()
-          .template get_info<sycl::info::device::max_compute_units>();
-  auto sg_sizes = policy.queue()
-                      .get_device()
-                      .template get_info<sycl::info::device::sub_group_sizes>();
-  int subgroup_size = sg_sizes.empty() ? 1 : sg_sizes.back();
-  // parallel for of serial sorts when we have sufficient number of segments for
-  // load balance when number of segments is large as compared to our target
-  // compute capability
-  if (nsegments >
-      compute_units *
-          (policy.queue().get_device().is_gpu() ? subgroup_size : 1)) {
-    dpct::internal::segmented_sort_pairs_by_parallel_for_of_sorts(
-        ::std::forward<Policy>(policy), keys_in, keys_out, values_in,
-        values_out, n, nsegments, begin_offsets, end_offsets, descending,
-        begin_bit, end_bit);
-  } else
-  {
-    dpct::internal::segmented_sort_pairs_by_parallel_sorts(
-        ::std::forward<Policy>(policy), keys_in, keys_out, values_in,
-        values_out, n, nsegments, begin_offsets, end_offsets, descending,
-        begin_bit, end_bit);
-  }
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
-inline void segmented_sort_pairs(
-    Policy &&policy, io_iterator_pair<Iter1> &keys,
-    io_iterator_pair<Iter2> &values, ::std::int64_t n, ::std::int64_t nsegments,
-    Iter3 begin_offsets, Iter3 end_offsets, bool descending = false,
-    bool do_swap_iters = false, int begin_bit = 0,
-    int end_bit = sizeof(typename ::std::iterator_traits<Iter1>::value_type) *
-                  8) {
-  segmented_sort_pairs(std::forward<Policy>(policy), keys.first(),
-                       keys.second(), values.first(), values.second(), n,
-                       nsegments, begin_offsets, end_offsets, descending,
-                       begin_bit, end_bit);
-  if (do_swap_iters) {
-    keys.swap();
-    values.swap();
-  }
-}
-
-template <typename Policy, typename Iter1, typename Iter2>
-inline void reduce_argmax(Policy &&policy, Iter1 input, Iter2 output,
-                          ::std::size_t n) {
-  dpct::arg_index_input_iterator<decltype(input), int> input_arg_idx(input);
-  auto ret = ::std::max_element(
-      policy, input_arg_idx, input_arg_idx + n,
-      [](const auto &a, const auto &b) { return (a.value < b.value); });
-  ::std::copy(::std::forward<Policy>(policy), ret, ret + 1, output);
-}
-
-template <typename Policy, typename Iter1, typename Iter2>
-inline void reduce_argmin(Policy &&policy, Iter1 input, Iter2 output,
-                          ::std::size_t n) {
-  dpct::arg_index_input_iterator<decltype(input), int> input_arg_idx(input);
-  auto ret = ::std::min_element(
-      policy, input_arg_idx, input_arg_idx + n,
-      [](const auto &a, const auto &b) { return (a.value < b.value); });
-  ::std::copy(::std::forward<Policy>(policy), ret, ret + 1, output);
-}
-
-template <typename Policy, typename Iter1, typename ValueT, typename CompT>
-inline ::std::pair<Iter1, Iter1> equal_range(Policy &&policy, Iter1 start,
-                                             Iter1 end, const ValueT &value,
-                                             CompT comp) {
-  ::std::vector<::std::int64_t> res_lower(1);
-  ::std::vector<::std::int64_t> res_upper(1);
-  ::std::vector<ValueT> value_vec(1, value);
-  ::oneapi::dpl::lower_bound(policy, start, end, value_vec.begin(),
-                             value_vec.end(), res_lower.begin(), comp);
-  ::oneapi::dpl::upper_bound(::std::forward<Policy>(policy), start, end,
-                             value_vec.begin(), value_vec.end(),
-                             res_upper.begin(), comp);
-  return ::std::make_pair(start + res_lower[0], start + res_upper[0]);
-}
-
-template <typename Policy, typename Iter1, typename ValueT>
-inline ::std::pair<Iter1, Iter1> equal_range(Policy &&policy, Iter1 start,
-                                             Iter1 end, const ValueT &value) {
-  return equal_range(::std::forward<Policy>(policy), start, end, value,
-                     internal::__less());
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
-inline ::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-    dpct::internal::is_iterator<Iter2>::value &&
-    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
-segmented_reduce_argmin(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
-                        ::std::int64_t nsegments, Iter3 begin_offsets,
-                        Iter3 end_offsets) {
-  policy.queue().submit([&](sycl::handler &cgh) {
-    cgh.parallel_for(nsegments, [=](sycl::id<1> i) {
-      if (end_offsets[i] <= begin_offsets[i]) {
-        keys_out[i] = dpct::key_value_pair(
-            1, ::std::numeric_limits<
-                   typename ::std::iterator_traits<Iter1>::value_type>::max());
-      } else {
-        dpct::arg_index_input_iterator<Iter1, int> arg_index(keys_in +
-                                                             begin_offsets[i]);
-        keys_out[i] = *::std::min_element(
-            arg_index, arg_index + (end_offsets[i] - begin_offsets[i]),
-            [](const auto &a, const auto &b) { return a.value < b.value; });
-      }
-    });
-  });
-  policy.queue().wait();
-}
-
-template <typename Policy, typename Iter1, typename Iter2, typename Iter3>
-inline ::std::enable_if_t<
-    dpct::internal::is_iterator<Iter1>::value &&
-    dpct::internal::is_iterator<Iter2>::value &&
-    internal::is_hetero_execution_policy<::std::decay_t<Policy>>::value>
-segmented_reduce_argmax(Policy &&policy, Iter1 keys_in, Iter2 keys_out,
-                        ::std::int64_t nsegments, Iter3 begin_offsets,
-                        Iter3 end_offsets) {
-  policy.queue().submit([&](sycl::handler &cgh) {
-    cgh.parallel_for(nsegments, [=](sycl::id<1> i) {
-      if (end_offsets[i] <= begin_offsets[i]) {
-        keys_out[i] = dpct::key_value_pair(
-            1,
-            ::std::numeric_limits<
-                typename ::std::iterator_traits<Iter1>::value_type>::lowest());
-      } else {
-        dpct::arg_index_input_iterator<Iter1, int> arg_index(keys_in +
-                                                             begin_offsets[i]);
-        keys_out[i] = *::std::max_element(
-            arg_index, arg_index + (end_offsets[i] - begin_offsets[i]),
-            [](const auto &a, const auto &b) { return a.value < b.value; });
-      }
-    });
-  });
-  policy.queue().wait();
-}
-
-template <typename ExecutionPolicy, typename InputIterator,
-          typename OutputIterator1, typename OutputIterator2,
-          typename OutputIterator3>
-void nontrivial_run_length_encode(ExecutionPolicy &&policy,
-                                  InputIterator input_beg,
-                                  OutputIterator1 offsets_out,
-                                  OutputIterator2 lengths_out,
-                                  OutputIterator3 num_runs,
-                                  ::std::int64_t num_items) {
-  using oneapi::dpl::make_transform_iterator;
-  using oneapi::dpl::make_zip_iterator;
-  using offsets_t =
-      typename ::std::iterator_traits<OutputIterator1>::value_type;
-  using lengths_t =
-      typename ::std::iterator_traits<OutputIterator2>::value_type;
-
-  auto input_end = input_beg + num_items;
-  // First element must be nontrivial run (start of first segment)
-  auto first_adj_it = oneapi::dpl::adjacent_find(policy, input_beg, input_end);
-  auto first_adj_idx = ::std::distance(input_beg, first_adj_it);
-  if (first_adj_it == input_end) {
-    ::std::fill(policy, num_runs, num_runs + 1, 0);
-    return;
-  }
-  auto get_prev_idx_element = [first_adj_idx](const auto &idx) {
-    auto out_idx = idx + first_adj_idx;
-    return (out_idx == 0) ? 0 : out_idx - 1;
-  };
-  auto get_next_idx_element = [first_adj_idx, num_items](const auto &idx) {
-    auto out_idx = idx + first_adj_idx;
-    return (out_idx == num_items - 1) ? num_items - 1 : out_idx + 1;
-  };
-  // TODO: Use shifted view to pad range once oneDPL ranges is non-experimental
-  auto left_shifted_input_beg =
-      oneapi::dpl::make_permutation_iterator(input_beg, get_prev_idx_element);
-  auto right_shifted_input_beg =
-      oneapi::dpl::make_permutation_iterator(input_beg, get_next_idx_element);
-  // Segment type for ith idx consists of zip of iterators at (i-1, i, i+1)
-  // padded at the ends
-  auto zipped_keys_beg = make_zip_iterator(
-      left_shifted_input_beg, input_beg, right_shifted_input_beg,
-      oneapi::dpl::counting_iterator<offsets_t>(0));
-  // Set flag at the beginning of new nontrivial run (ex: (2, 3, 3) -> 1)
-  auto key_flags_beg =
-      make_transform_iterator(zipped_keys_beg, [num_items](const auto &zipped) {
-        using ::std::get;
-        bool last_idx_mask = get<3>(zipped) != num_items - 1;
-        return (get<0>(zipped) != get<1>(zipped) &&
-                get<1>(zipped) == get<2>(zipped)) &&
-               last_idx_mask;
-      });
-  auto count_beg = oneapi::dpl::counting_iterator<offsets_t>(0);
-  auto const_it = dpct::make_constant_iterator(lengths_t(1));
-  // Check for presence of nontrivial element at current index
-  auto tr_nontrivial_flags = make_transform_iterator(
-      make_zip_iterator(left_shifted_input_beg, input_beg),
-      [](const auto &zip) {
-        using ::std::get;
-        return get<0>(zip) == get<1>(zip);
-      });
-  auto zipped_vals_beg =
-      make_zip_iterator(tr_nontrivial_flags, count_beg, const_it);
-  auto pred = [](bool lhs, bool rhs) { return !rhs; };
-  auto op = [](auto lhs, const auto &rhs) {
-    using ::std::get;
-
-    // Update length count of run.
-    // The first call to this op will use the first element of the input as lhs
-    // and second element as rhs. get<0>(first_element) is ignored in favor of a
-    // constant `1` in get<2>, avoiding the need for special casing the first
-    // element. The constant `1` utilizes the knowledge that each segment begins
-    // with a nontrivial run.
-    get<2>(lhs) += get<0>(rhs);
-
-    // A run's starting index is stored in get<1>(lhs) as the initial value in
-    // the segment and is preserved throughout the segment's reduction as the
-    // nontrivial run's offset.
-
-    return ::std::move(lhs);
-  };
-  auto zipped_out_beg = make_zip_iterator(oneapi::dpl::discard_iterator(),
-                                          offsets_out, lengths_out);
-  auto [_, zipped_out_vals_end] = oneapi::dpl::reduce_by_segment(
-      policy, key_flags_beg + first_adj_idx, key_flags_beg + num_items,
-      zipped_vals_beg + first_adj_idx, oneapi::dpl::discard_iterator(),
-      zipped_out_beg, pred, op);
-  auto ret_dist = ::std::distance(zipped_out_beg, zipped_out_vals_end);
-  ::std::fill(policy, num_runs, num_runs + 1, ret_dist);
-}
-
-} // end namespace dpct
-
-#endif
diff --git a/dpct/dpl_extras/dpcpp_extensions.h b/dpct/dpl_extras/dpcpp_extensions.h
deleted file mode 100644
index 05a0068e65925..0000000000000
--- a/dpct/dpl_extras/dpcpp_extensions.h
+++ /dev/null
@@ -1,747 +0,0 @@
-//==---- dpcpp_extensions.h ------------------*- C++ -*---------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------===//
-
-#ifndef __DPCT_DPCPP_EXTENSIONS_H__
-#define __DPCT_DPCPP_EXTENSIONS_H__
-
-#include <sycl/sycl.hpp>
-#include <stdexcept>
-
-#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
-#include <sycl/ext/oneapi/experimental/user_defined_reductions.hpp>
-#endif
-
-#include "../dpct.hpp"
-#include "functional.h"
-
-namespace dpct {
-namespace group {
-namespace detail {
-
-template <typename... _Args>
-constexpr auto __reduce_over_group(_Args... __args) {
-  return sycl::reduce_over_group(__args...);
-}
-
-template <typename... _Args> constexpr auto __group_broadcast(_Args... __args) {
-  return sycl::group_broadcast(__args...);
-}
-
-template <typename... _Args>
-constexpr auto __exclusive_scan_over_group(_Args... __args) {
-  return sycl::exclusive_scan_over_group(__args...);
-}
-
-template <typename... _Args>
-constexpr auto __inclusive_scan_over_group(_Args... __args) {
-  return sycl::inclusive_scan_over_group(__args...);
-}
-
-} // end namespace detail
-
-/// Perform an exclusive scan over the values of inputs from all work-items in
-/// the group using the operator binary_op, which must be one of the SYCL 2020
-/// group algorithms library function objects.
-///
-/// \param item A work-item in a group.
-/// \param inputs Pointer to the input data for the scan operation.
-/// \param outputs Pointer to the location where scan results will be stored.
-/// \param init initial value of the scan result.
-/// \param binary_op functor that implements the binary operation used to
-/// perform the scan.
-template <typename Item, typename T, class BinaryOperation,
-          int VALUES_PER_THREAD>
-__dpct_inline__ void
-exclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD],
-               T (&outputs)[VALUES_PER_THREAD], T init,
-               BinaryOperation binary_op) {
-  T result = inputs[0];
-
-#pragma unroll
-  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
-    result = binary_op(result, inputs[i]);
-  }
-
-  T exclusive_result =
-      detail::__exclusive_scan_over_group(item.get_group(), result, binary_op);
-
-  T input = inputs[0];
-  if (item.get_local_linear_id() == 0) {
-    outputs[0] = init;
-  } else {
-    outputs[0] = exclusive_result;
-  }
-
-#pragma unroll
-  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
-    T output = binary_op(input, outputs[i - 1]);
-    input = inputs[i];
-    outputs[i] = output;
-  }
-}
-
-/// Perform an exclusive scan over the values of input from all work-items in
-/// the group using the operator binary_op, which must be one of the SYCL 2020
-/// group algorithms library function objects.
-///
-/// \param item A work-item in a group.
-/// \param input Input data for the scan operation.
-/// \param init initial value of the scan result.
-/// \param binary_op functor that implements the binary operation used to
-/// perform the scan. \param group_aggregate group-wide aggregate of all inputs
-/// in the work-items of the group. \returns exclusive scan of the first i
-/// work-items where item is the i-th work item.
-template <typename Item, typename T, class BinaryOperation>
-__dpct_inline__ T
-exclusive_scan(const Item &item, T input, T init, BinaryOperation binary_op,
-               T &group_aggregate) {
-  T output = detail::__exclusive_scan_over_group(item.get_group(), input, init,
-                                                 binary_op);
-  if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
-    group_aggregate = binary_op(output, input);
-  }
-
-  group_aggregate = detail::__group_broadcast(
-      item.get_group(), group_aggregate, item.get_local_range().size() - 1);
-  return output;
-}
-
-/// Perform an exclusive scan over the values of input from all work-items in
-/// the group using the operator binary_op, which must be one of the SYCL 2020
-/// group algorithms library function objects.
-///
-/// \param item A work-item in a group.
-/// \param input Input data for the scan operation.
-/// \param binary_op functor that implements the binary operation used to
-/// perform the scan. \param prefix_callback_op functor invoked by the first
-/// work-item in the group that returns the
-///        initial value in the resulting scan of the work-items in the group.
-/// \returns exclusive scan of the input elements assigned to work-items in the
-/// group.
-template <typename Item, typename T, class BinaryOperation,
-          class GroupPrefixCallbackOperation>
-__dpct_inline__ T
-exclusive_scan(const Item &item, T input, BinaryOperation binary_op,
-               GroupPrefixCallbackOperation &prefix_callback_op) {
-  T group_aggregate;
-
-  T output =
-      detail::__exclusive_scan_over_group(item.get_group(), input, binary_op);
-  if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
-    group_aggregate = binary_op(output, input);
-  }
-
-  group_aggregate = detail::__group_broadcast(
-      item.get_group(), group_aggregate, item.get_local_range().size() - 1);
-
-  T group_prefix = prefix_callback_op(group_aggregate);
-  if (item.get_local_linear_id() == 0) {
-    output = group_prefix;
-  } else {
-    output = binary_op(group_prefix, output);
-  }
-
-  return output;
-}
-
-namespace detail {
-
-typedef uint16_t digit_counter_type;
-typedef uint32_t packed_counter_type;
-
-template <int N, int CURRENT_VAL = N, int COUNT = 0> struct log2 {
-  enum { VALUE = log2<N, (CURRENT_VAL >> 1), COUNT + 1>::VALUE };
-};
-
-template <int N, int COUNT> struct log2<N, 0, COUNT> {
-  enum { VALUE = (1 << (COUNT - 1) < N) ? COUNT : COUNT - 1 };
-};
-
-template <int RADIX_BITS, bool DESCENDING = false> class radix_rank {
-public:
-  static size_t get_local_memory_size(size_t group_threads) {
-    return group_threads * PADDED_COUNTER_LANES * sizeof(packed_counter_type);
-  }
-
-  radix_rank(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  template <typename Item, int VALUES_PER_THREAD>
-  __dpct_inline__ void
-  rank_keys(const Item &item, uint32_t (&keys)[VALUES_PER_THREAD],
-            int (&ranks)[VALUES_PER_THREAD], int current_bit, int num_bits) {
-
-    digit_counter_type thread_prefixes[VALUES_PER_THREAD];
-    digit_counter_type *digit_counters[VALUES_PER_THREAD];
-    digit_counter_type *buffer =
-        reinterpret_cast<digit_counter_type *>(_local_memory);
-
-    reset_local_memory(item);
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      uint32_t digit = ::dpct::bfe(keys[i], current_bit, num_bits);
-      uint32_t sub_counter = digit >> LOG_COUNTER_LANES;
-      uint32_t counter_lane = digit & (COUNTER_LANES - 1);
-
-      if (DESCENDING) {
-        sub_counter = PACKING_RATIO - 1 - sub_counter;
-        counter_lane = COUNTER_LANES - 1 - counter_lane;
-      }
-
-      digit_counters[i] =
-          &buffer[counter_lane * item.get_local_range().size() * PACKING_RATIO +
-                  item.get_local_linear_id() * PACKING_RATIO + sub_counter];
-      thread_prefixes[i] = *digit_counters[i];
-      *digit_counters[i] = thread_prefixes[i] + 1;
-    }
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-    scan_counters(item);
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      ranks[i] = thread_prefixes[i] + *digit_counters[i];
-    }
-  }
-
-private:
-  template <typename Item>
-  __dpct_inline__ void reset_local_memory(const Item &item) {
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      ptr[i * item.get_local_range().size() + item.get_local_linear_id()] = 0;
-    }
-  }
-
-  template <typename Item>
-  __dpct_inline__ packed_counter_type upsweep(const Item &item) {
-    packed_counter_type sum = 0;
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; i++) {
-      cached_segment[i] =
-          ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i];
-    }
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      sum += cached_segment[i];
-    }
-
-    return sum;
-  }
-
-  template <typename Item>
-  __dpct_inline__ void
-  exclusive_downsweep(const Item &item, packed_counter_type raking_partial) {
-    packed_counter_type *ptr =
-        reinterpret_cast<packed_counter_type *>(_local_memory);
-    packed_counter_type sum = raking_partial;
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      packed_counter_type value = cached_segment[i];
-      cached_segment[i] = sum;
-      sum += value;
-    }
-
-#pragma unroll
-    for (int i = 0; i < PADDED_COUNTER_LANES; ++i) {
-      ptr[item.get_local_linear_id() * PADDED_COUNTER_LANES + i] =
-          cached_segment[i];
-    }
-  }
-
-  struct prefix_callback {
-    __dpct_inline__ packed_counter_type
-    operator()(packed_counter_type block_aggregate) {
-      packed_counter_type block_prefix = 0;
-
-#pragma unroll
-      for (int packed = 1; packed < PACKING_RATIO; packed++) {
-        block_prefix += block_aggregate
-                        << (sizeof(digit_counter_type) * 8 * packed);
-      }
-
-      return block_prefix;
-    }
-  };
-
-  template <typename Item>
-  __dpct_inline__ void scan_counters(const Item &item) {
-    packed_counter_type raking_partial = upsweep(item);
-
-    prefix_callback callback;
-    packed_counter_type exclusive_partial = exclusive_scan(
-        item, raking_partial, sycl::ext::oneapi::plus<packed_counter_type>(),
-        callback);
-
-    exclusive_downsweep(item, exclusive_partial);
-  }
-
-private:
-  static constexpr int PACKING_RATIO =
-      sizeof(packed_counter_type) / sizeof(digit_counter_type);
-  static constexpr int LOG_PACKING_RATIO = log2<PACKING_RATIO>::VALUE;
-  static constexpr int LOG_COUNTER_LANES = RADIX_BITS - LOG_PACKING_RATIO;
-  static constexpr int COUNTER_LANES = 1 << LOG_COUNTER_LANES;
-  static constexpr int PADDED_COUNTER_LANES = COUNTER_LANES + 1;
-
-  packed_counter_type cached_segment[PADDED_COUNTER_LANES];
-  uint8_t *_local_memory;
-};
-
-template <typename T, typename U> struct base_traits {
-
-  static __dpct_inline__ U twiddle_in(U key) {
-    throw std::runtime_error("Not implemented");
-  }
-  static __dpct_inline__ U twiddle_out(U key) {
-    throw std::runtime_error("Not implemented");
-  }
-};
-
-template <typename U> struct base_traits<uint32_t, U> {
-  static __dpct_inline__ U twiddle_in(U key) { return key; }
-  static __dpct_inline__ U twiddle_out(U key) { return key; }
-};
-
-template <typename U> struct base_traits<int, U> {
-  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
-  static __dpct_inline__ U twiddle_in(U key) { return key ^ HIGH_BIT; }
-  static __dpct_inline__ U twiddle_out(U key) { return key ^ HIGH_BIT; }
-};
-
-template <typename U> struct base_traits<float, U> {
-  static constexpr U HIGH_BIT = U(1) << ((sizeof(U) * 8) - 1);
-  static __dpct_inline__ U twiddle_in(U key) {
-    U mask = (key & HIGH_BIT) ? U(-1) : HIGH_BIT;
-    return key ^ mask;
-  }
-  static __dpct_inline__ U twiddle_out(U key) {
-    U mask = (key & HIGH_BIT) ? HIGH_BIT : U(-1);
-    return key ^ mask;
-  }
-};
-
-template <typename T> struct traits : base_traits<T, T> {};
-template <> struct traits<uint32_t> : base_traits<uint32_t, uint32_t> {};
-template <> struct traits<int> : base_traits<int, uint32_t> {};
-template <> struct traits<float> : base_traits<float, uint32_t> {};
-
-} // namespace detail
-
-namespace detail {
-
-template <int N> struct power_of_two {
-  enum { VALUE = ((N & (N - 1)) == 0) };
-};
-
-__dpct_inline__ uint32_t shr_add(uint32_t x, uint32_t shift, uint32_t addend) {
-  return (x >> shift) + addend;
-}
-
-} // namespace detail
-
-/// Implements scatter to blocked exchange pattern used in radix sort algorithm.
-///
-/// \tparam T type of the data elements exchanges
-/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
-template <typename T, int VALUES_PER_THREAD> class exchange {
-public:
-  static size_t get_local_memory_size(size_t group_threads) {
-    size_t padding_values =
-        (INSERT_PADDING)
-            ? ((group_threads * VALUES_PER_THREAD) >> LOG_LOCAL_MEMORY_BANKS)
-            : 0;
-    return (group_threads * VALUES_PER_THREAD + padding_values) * sizeof(T);
-  }
-
-  exchange(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  /// Rearrange elements from rank order to blocked order
-  template <typename Item>
-  __dpct_inline__ void
-  scatter_to_blocked(Item item, T (&keys)[VALUES_PER_THREAD],
-                     int (&ranks)[VALUES_PER_THREAD]) {
-    T *buffer = reinterpret_cast<T *>(_local_memory);
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; i++) {
-      int offset = ranks[i];
-      if (INSERT_PADDING)
-        offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
-      buffer[offset] = keys[i];
-    }
-
-    item.barrier(sycl::access::fence_space::local_space);
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; i++) {
-      int offset = (item.get_local_id(0) * VALUES_PER_THREAD) + i;
-      if (INSERT_PADDING)
-        offset = detail::shr_add(offset, LOG_LOCAL_MEMORY_BANKS, offset);
-      keys[i] = buffer[offset];
-    }
-  }
-
-private:
-  static constexpr int LOG_LOCAL_MEMORY_BANKS = 5;
-  static constexpr bool INSERT_PADDING =
-      (VALUES_PER_THREAD > 4) &&
-      (detail::power_of_two<VALUES_PER_THREAD>::VALUE);
-
-  uint8_t *_local_memory;
-};
-
-/// Implements radix sort to sort integer data elements assigned to all threads
-/// in the group.
-///
-/// \tparam T type of the data elements exchanges
-/// \tparam VALUES_PER_THREAD number of data elements assigned to a thread
-/// \tparam DECENDING boolean value indicating if data elements are sorted in
-/// decending order.
-template <typename T, int VALUES_PER_THREAD, bool DESCENDING = false>
-class radix_sort {
-public:
-  static size_t get_local_memory_size(size_t group_threads) {
-    size_t ranks_size =
-        detail::radix_rank<RADIX_BITS>::get_local_memory_size(group_threads);
-    size_t exchange_size =
-        exchange<T, VALUES_PER_THREAD>::get_local_memory_size(group_threads);
-    return sycl::max(ranks_size, exchange_size);
-  }
-
-  radix_sort(uint8_t *local_memory) : _local_memory(local_memory) {}
-
-  template <typename Item>
-  __dpct_inline__ void
-  sort(const Item &item, T (&keys)[VALUES_PER_THREAD], int begin_bit = 0,
-       int end_bit = 8 * sizeof(T)) {
-
-    uint32_t(&unsigned_keys)[VALUES_PER_THREAD] =
-        reinterpret_cast<uint32_t(&)[VALUES_PER_THREAD]>(keys);
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      unsigned_keys[i] = detail::traits<T>::twiddle_in(unsigned_keys[i]);
-    }
-
-    while (true) {
-      int pass_bits = sycl::min(RADIX_BITS, end_bit - begin_bit);
-
-      int ranks[VALUES_PER_THREAD];
-      detail::radix_rank<RADIX_BITS, DESCENDING>(_local_memory)
-          .template rank_keys(item, unsigned_keys, ranks, begin_bit, pass_bits);
-      begin_bit += RADIX_BITS;
-
-      item.barrier(sycl::access::fence_space::local_space);
-
-      exchange<T, VALUES_PER_THREAD>(_local_memory)
-          .scatter_to_blocked(item, keys, ranks);
-
-      item.barrier(sycl::access::fence_space::local_space);
-
-      if (begin_bit >= end_bit)
-        break;
-    }
-
-#pragma unroll
-    for (int i = 0; i < VALUES_PER_THREAD; ++i) {
-      unsigned_keys[i] = detail::traits<T>::twiddle_out(unsigned_keys[i]);
-    }
-  }
-
-private:
-  static constexpr int RADIX_BITS = 4;
-
-  uint8_t *_local_memory;
-};
-
-/// Perform a reduction of the data elements assigned to all threads in the
-/// group.
-///
-/// \param item A work-item in a group.
-/// \param inputs Pointer to the input data for the reduce operation.
-/// \param binary_op functor that implements the binary operation used to
-/// perform the scan. \returns value of the reduction using binary_op
-template <typename Item, typename T, class BinaryOperation,
-          int VALUES_PER_THREAD>
-__dpct_inline__ T
-reduce(Item item, T (&inputs)[VALUES_PER_THREAD], BinaryOperation binary_op) {
-  T result = inputs[0];
-
-#pragma unroll
-  for (int i = 1; i < VALUES_PER_THREAD; i++) {
-    result = binary_op(result, inputs[i]);
-  }
-  return detail::__reduce_over_group(item.get_group(), result, binary_op);
-}
-
-/// Perform a reduction on a limited number of the work items in a subgroup
-///
-/// \param item A work-item in a group.
-/// \param value value per work item which is to be reduced
-/// \param items_to_reduce num work items at the start of the subgroup to reduce
-/// \param binary_op functor that implements the binary operation used to
-/// perform the scan. \returns value of the reduction using binary_op
-template <typename Item, typename T, class BinaryOperation>
-__dpct_inline__
-typename ::std::enable_if_t<sycl::has_known_identity_v<BinaryOperation, T>, T>
-reduce_over_partial_group(const Item &item, const T &value,
-                          const ::std::uint16_t &items_to_reduce,
-                          BinaryOperation binary_op) {
-  T value_temp = (item.get_local_linear_id() < items_to_reduce)
-                     ? value
-                     : sycl::known_identity_v<BinaryOperation, T>;
-  return detail::__reduce_over_group(item.get_sub_group(), value_temp,
-                                     binary_op);
-}
-
-/// Perform an inclusive scan over the values of inputs from all work-items in
-/// the group using the operator binary_op, which must be one of the SYCL 2020
-/// group algorithms library function objects.
-///
-/// \param item A work-item in a group.
-/// \param inputs Pointer to the input data for the scan operation.
-/// \param outputs Pointer to the location where scan results will be stored.
-/// \param binary_op functor that implements the binary operation used to
-/// perform the scan. \returns inclusive scan of the input elements assigned to
-/// work-items in the group.
-template <typename Item, typename T, class BinaryOperation,
-          int VALUES_PER_THREAD>
-__dpct_inline__ void
-inclusive_scan(const Item &item, T (&inputs)[VALUES_PER_THREAD],
-               T (&outputs)[VALUES_PER_THREAD], BinaryOperation binary_op) {
-  T result = inputs[0];
-
-#pragma unroll
-  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
-    result = binary_op(result, inputs[i]);
-  }
-
-  T exclusive_result =
-      detail::__exclusive_scan_over_group(item.get_group(), result, binary_op);
-
-  if (item.get_local_linear_id() == 0) {
-    outputs[0] = inputs[0];
-  } else {
-    outputs[0] = binary_op(inputs[0], exclusive_result);
-  }
-
-#pragma unroll
-  for (int i = 1; i < VALUES_PER_THREAD; ++i) {
-    outputs[i] = binary_op(inputs[i], outputs[i - 1]);
-  }
-}
-
-/// Perform an inclusive scan over the values of inputs from all work-items in
-/// the group using the operator binary_op, which must be one of the SYCL 2020
-/// group algorithms library function objects.
-///
-/// \param item A work-item in a group.
-/// \param input Pointer to the input data for the scan operation.
-/// \param binary_op functor that implements the binary operation used to
-/// perform the scan. \param group_aggregate group-wide aggregate of all inputs
-/// in the work-items of the group. \returns inclusive scan of the input
-/// elements assigned to work-items in the group.
-template <typename Item, typename T, class BinaryOperation>
-__dpct_inline__ T inclusive_scan(const Item &item, T input,
-                                                BinaryOperation binary_op,
-                                                T &group_aggregate) {
-  T output =
-      detail::__inclusive_scan_over_group(item.get_group(), input, binary_op);
-  if (item.get_local_linear_id() == item.get_local_range().size() - 1) {
-    group_aggregate = output;
-  }
-
-  group_aggregate = detail::__group_broadcast(
-      item.get_group(), group_aggregate, item.get_local_range().size() - 1);
-  return output;
-}
-
-/// Perform an inclusive scan over the values of input from all work-items in
-/// the group using the operator binary_op, which must be one of the SYCL 2020
-/// group algorithms library function objects.
-///
-/// \param item A work-item in a group.
-/// \param input Input data for the scan operation.
-/// \param binary_op functor that implements the binary operation used to
-/// perform the scan. \param prefix_callback_op functor invoked by the first
-/// work-item in the group that returns the
-///        initial value in the resulting scan of the work-items in the group.
-/// \returns inclusive scan of the input elements assigned to work-items in the
-/// group.
-template <typename Item, typename T, class BinaryOperation,
-          class GroupPrefixCallbackOperation>
-__dpct_inline__ T
-inclusive_scan(const Item &item, T input, BinaryOperation binary_op,
-               GroupPrefixCallbackOperation &prefix_callback_op) {
-  T group_aggregate;
-
-  T output = inclusive_scan(item, input, binary_op, group_aggregate);
-  T group_prefix = prefix_callback_op(group_aggregate);
-
-  return binary_op(group_prefix, output);
-}
-
-} // namespace group
-
-namespace device {
-
-namespace detail {
-
-template <typename... _Args> constexpr auto __joint_reduce(_Args... __args) {
-  return sycl::joint_reduce(__args...);
-}
-
-} // namespace detail
-
-/// Perform a reduce on each of the segments specified within data stored on
-/// the device.
-///
-/// \param queue Command queue used to access device used for reduction
-/// \param inputs Pointer to the data elements on the device to be reduced
-/// \param outputs Pointer to the storage where the reduced value for each
-/// segment will be stored \param segment_count number of segments to be reduced
-/// \param begin_offsets Pointer to the set of indices that are the first
-/// element in each segment \param end_offsets Pointer to the set of indices
-/// that are one past the last element in each segment \param binary_op functor
-/// that implements the binary operation used to perform the scan. \param init
-/// initial value of the reduction for each segment.
-template <int GROUP_SIZE, typename T, typename OffsetT, class BinaryOperation>
-void segmented_reduce(sycl::queue queue, T *inputs, T *outputs,
-                      size_t segment_count, OffsetT *begin_offsets,
-                      OffsetT *end_offsets, BinaryOperation binary_op, T init) {
-
-  sycl::range<1> global_size(segment_count * GROUP_SIZE);
-  sycl::range<1> local_size(GROUP_SIZE);
-
-  queue.submit([&](sycl::handler &cgh) {
-    cgh.parallel_for(
-        sycl::nd_range<1>(global_size, local_size), [=](sycl::nd_item<1> item) {
-          OffsetT segment_begin = begin_offsets[item.get_group_linear_id()];
-          OffsetT segment_end = end_offsets[item.get_group_linear_id()];
-          if (segment_begin == segment_end) {
-            if (item.get_local_linear_id() == 0) {
-              outputs[item.get_group_linear_id()] = init;
-            }
-            return;
-          }
-
-          sycl::multi_ptr<T, sycl::access::address_space::global_space>
-              input_ptr = inputs;
-          T group_aggregate = detail::__joint_reduce(
-              item.get_group(), input_ptr + segment_begin,
-              input_ptr + segment_end, init, binary_op);
-
-          if (item.get_local_linear_id() == 0) {
-            outputs[item.get_group_linear_id()] = group_aggregate;
-          }
-        });
-  });
-}
-
-
-#ifdef SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
-
-namespace experimental {
-namespace detail {
-template <typename _Tp, typename... _Ts> struct __is_any {
-  constexpr static bool value = std::disjunction_v<
-      std::is_same<std::remove_cv_t<_Tp>, std::remove_cv_t<_Ts>>...>;
-};
-
-template <typename _Tp, typename _Bp> struct __in_native_op_list {
-  constexpr static bool value =
-      __is_any<_Bp, sycl::plus<_Tp>, sycl::bit_or<_Tp>, sycl::bit_xor<_Tp>,
-               sycl::bit_and<_Tp>, sycl::maximum<_Tp>, sycl::minimum<_Tp>,
-               sycl::multiplies<_Tp>>::value;
-};
-
-template <typename _Tp, typename _Bp> struct __is_native_op {
-  constexpr static bool value = __in_native_op_list<_Tp, _Bp>::value ||
-                                __in_native_op_list<void, _Bp>::value;
-};
-
-} // namespace detail
-
-/// Perform a reduce on each of the segments specified within data stored on
-/// the device. Compared with dpct::device::segmented_reduce, this experimental
-/// feature support user define reductions.
-///
-/// \param queue Command queue used to access device used for reduction
-/// \param inputs Pointer to the data elements on the device to be reduced
-/// \param outputs Pointer to the storage where the reduced value for each
-/// segment will be stored \param segment_count number of segments to be reduced
-/// \param begin_offsets Pointer to the set of indices that are the first
-/// element in each segment \param end_offsets Pointer to the set of indices
-/// that are one past the last element in each segment \param binary_op functor
-/// that implements the binary operation used to perform the scan. \param init
-/// initial value of the reduction for each segment.
-template <int GROUP_SIZE, typename T, typename OffsetT, class BinaryOperation>
-void segmented_reduce(sycl::queue queue, T *inputs, T *outputs,
-                      size_t segment_count, OffsetT *begin_offsets,
-                      OffsetT *end_offsets, BinaryOperation binary_op, T init) {
-
-  sycl::range<1> global_size(segment_count * GROUP_SIZE);
-  sycl::range<1> local_size(GROUP_SIZE);
-
-  if constexpr (!detail::__is_native_op<T, BinaryOperation>::value) {
-    queue.submit([&](sycl::handler &cgh) {
-      size_t temp_memory_size = GROUP_SIZE * sizeof(T);
-      auto scratch = sycl::local_accessor<std::byte, 1>(temp_memory_size, cgh);
-      cgh.parallel_for(
-          sycl::nd_range<1>(global_size, local_size),
-          [=](sycl::nd_item<1> item) {
-            OffsetT segment_begin = begin_offsets[item.get_group_linear_id()];
-            OffsetT segment_end = end_offsets[item.get_group_linear_id()];
-            if (segment_begin == segment_end) {
-              if (item.get_local_linear_id() == 0) {
-                outputs[item.get_group_linear_id()] = init;
-              }
-              return;
-            }
-            // Create a handle that associates the group with an allocation it
-            // can use
-            auto handle =
-                sycl::ext::oneapi::experimental::group_with_scratchpad(
-                    item.get_group(),
-                    sycl::span(&scratch[0], temp_memory_size));
-            T group_aggregate = sycl::ext::oneapi::experimental::joint_reduce(
-                handle, inputs + segment_begin, inputs + segment_end, init,
-                binary_op);
-            if (item.get_local_linear_id() == 0) {
-              outputs[item.get_group_linear_id()] = group_aggregate;
-            }
-          });
-    });
-  } else {
-    dpct::device::segmented_reduce<GROUP_SIZE>(queue, inputs, outputs,
-                                               segment_count, begin_offsets,
-                                               end_offsets, binary_op, init);
-  }
-}
-} // namespace experimental
-
-#endif // SYCL_EXT_ONEAPI_USER_DEFINED_REDUCTIONS
-
-
-} // namespace device
-} // namespace dpct
-
-#endif
diff --git a/dpct/dpl_extras/functional.h b/dpct/dpl_extras/functional.h
deleted file mode 100644
index bab82814c2103..0000000000000
--- a/dpct/dpl_extras/functional.h
+++ /dev/null
@@ -1,453 +0,0 @@
-//==---- functional.h -----------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_FUNCTIONAL_H__
-#define __DPCT_FUNCTIONAL_H__
-
-#include <functional>
-#include <oneapi/dpl/functional>
-#include <oneapi/dpl/iterator>
-
-#if ONEDPL_USE_DPCPP_BACKEND
-#include <oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h>
-#endif
-
-#include <tuple>
-#include <utility>
-
-#include "../dpct.hpp"
-#define _DPCT_GCC_VERSION                                                      \
-  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-
-// Portability "#pragma" definition
-#ifdef _MSC_VER
-#define _DPCT_PRAGMA(x) __pragma(x)
-#else
-#define _DPCT_PRAGMA(x) _Pragma(#x)
-#endif
-
-// Enable loop unrolling pragmas where supported
-#if (__INTEL_COMPILER ||                                                       \
-     (!defined(__INTEL_COMPILER) && _DPCT_GCC_VERSION >= 80000))
-#define _DPCT_PRAGMA_UNROLL _DPCT_PRAGMA(unroll)
-#else // no pragma unroll
-#define _DPCT_PRAGMA_UNROLL
-#endif
-
-namespace dpct {
-
-struct null_type {};
-
-// Function object to wrap user defined functors to provide compile time "const"
-// workaround for user function objects.
-// The SYCL spec (4.12) states that writing to a function object during a SYCL
-// kernel is undefined behavior.  This wrapper is provided as a compile-time
-// work around, but functors used in SYCL kernels must be `const` in practice.
-template <typename _Op> struct mark_functor_const {
-  mutable _Op op;
-  mark_functor_const() : op() {}
-  mark_functor_const(const _Op &__op) : op(__op) {}
-  mark_functor_const(_Op &&__op) : op(::std::move(__op)) {}
-  template <typename... _T> auto operator()(_T &&...x) const {
-    return op(std::forward<_T>(x)...);
-  }
-};
-
-namespace internal {
-
-template <class _ExecPolicy, class _T>
-using enable_if_execution_policy =
-    typename std::enable_if<oneapi::dpl::execution::is_execution_policy<
-                                typename std::decay<_ExecPolicy>::type>::value,
-                            _T>::type;
-
-template <typename _T> struct is_hetero_execution_policy : ::std::false_type {};
-
-template <typename... PolicyParams>
-struct is_hetero_execution_policy<
-    oneapi::dpl::execution::device_policy<PolicyParams...>> : ::std::true_type {
-};
-
-template <typename _T> struct is_fpga_execution_policy : ::std::false_type {};
-
-#if _ONEDPL_FPGA_DEVICE
-template <unsigned int unroll_factor, typename... PolicyParams>
-struct is_hetero_execution_policy<
-    execution::fpga_policy<unroll_factor, PolicyParams...>> : ::std::true_type {
-};
-#endif
-
-template <class _ExecPolicy, class _T>
-using enable_if_hetero_execution_policy = typename std::enable_if<
-    is_hetero_execution_policy<typename std::decay<_ExecPolicy>::type>::value,
-    _T>::type;
-
-#if _ONEDPL_CPP14_INTEGER_SEQUENCE_PRESENT
-
-template <std::size_t... _Sp>
-using index_sequence = std::index_sequence<_Sp...>;
-template <std::size_t _Np>
-using make_index_sequence = std::make_index_sequence<_Np>;
-
-#else
-
-template <std::size_t... _Sp> class index_sequence {};
-
-template <std::size_t _Np, std::size_t... _Sp>
-struct make_index_sequence_impl
-    : make_index_sequence_impl<_Np - 1, _Np - 1, _Sp...> {};
-
-template <std::size_t... _Sp> struct make_index_sequence_impl<0, _Sp...> {
-  using type = index_sequence<_Sp...>;
-};
-
-template <std::size_t _Np>
-using make_index_sequence = typename make_index_sequence_impl<_Np>::type;
-#endif
-
-// Minimal buffer implementations for temporary storage in mapping rules
-// Some of our algorithms need to start with raw memory buffer,
-// not an initialized array, because initialization/destruction
-// would make the span be at least O(N).
-#if ONEDPL_USE_DPCPP_BACKEND
-template <typename _Tp> class __buffer {
-  sycl::buffer<_Tp, 1> __buf;
-
-  __buffer(const __buffer &) = delete;
-
-  void operator=(const __buffer &) = delete;
-
-public:
-  // Try to obtain buffer of given size to store objects of _Tp type
-  __buffer(std::size_t __n) : __buf(sycl::range<1>(__n)) {}
-
-  // Return pointer to buffer, or  NULL if buffer could not be obtained.
-  auto get() -> decltype(oneapi::dpl::begin(__buf)) const {
-    return oneapi::dpl::begin(__buf);
-  }
-};
-#else
-template <typename _Tp> class __buffer {
-  std::unique_ptr<_Tp> _M_ptr;
-
-  __buffer(const __buffer &) = delete;
-
-  void operator=(const __buffer &) = delete;
-
-public:
-  // Try to obtain buffer of given size to store objects of _Tp type
-  __buffer(const std::size_t __n) : _M_ptr(new _Tp[__n]) {}
-
-  // Return pointer to buffer, or  NULL if buffer could not be obtained.
-  _Tp *get() const { return _M_ptr.get(); }
-};
-#endif
-
-// Implements C++14 std::less<void> specialization to allow parameter type
-// deduction.
-class __less {
-public:
-  template <typename _Xp, typename _Yp>
-  bool operator()(_Xp &&__x, _Yp &&__y) const {
-    return std::forward<_Xp>(__x) < std::forward<_Yp>(__y);
-  }
-};
-
-template <typename Policy, typename NewName> struct rebind_policy {
-  using type = Policy;
-};
-
-template <typename KernelName, typename NewName>
-struct rebind_policy<oneapi::dpl::execution::device_policy<KernelName>,
-                     NewName> {
-  using type = oneapi::dpl::execution::device_policy<NewName>;
-};
-
-#if _ONEDPL_FPGA_DEVICE
-template <unsigned int factor, typename KernelName, typename NewName>
-struct rebind_policy<oneapi::dpl::execution::fpga_policy<factor, KernelName>,
-                     NewName> {
-  using type = oneapi::dpl::execution::fpga_policy<factor, NewName>;
-};
-#endif
-
-template <typename T1, typename T2,
-          typename R1 = typename std::iterator_traits<T1>::reference,
-          typename R2 = typename std::iterator_traits<T2>::reference>
-struct perm_fun {
-  typedef R2 result_of;
-  perm_fun(T1 input) : source(input) {}
-
-  R2 operator()(R1 x) const { return *(source + x); }
-
-private:
-  T1 source;
-};
-
-// Functor compares first element (key) from tied sequence.
-template <typename Compare = class internal::__less> struct compare_key_fun {
-  typedef bool result_of;
-  compare_key_fun(Compare _comp = internal::__less()) : comp(_comp) {}
-
-  template <typename _T1, typename _T2>
-  result_of operator()(_T1 &&a, _T2 &&b) const {
-    using std::get;
-    return comp(get<0>(a), get<0>(b));
-  }
-
-private:
-  mutable Compare comp;
-};
-
-// Functor evaluates second element of tied sequence with predicate.
-// Used by: copy_if, remove_copy_if, stable_partition_copy
-// Lambda:
-template <typename Predicate> struct predicate_key_fun {
-  typedef bool result_of;
-  predicate_key_fun(Predicate _pred) : pred(_pred) {}
-
-  template <typename _T1> result_of operator()(_T1 &&a) const {
-    using std::get;
-    return pred(get<1>(a));
-  }
-
-private:
-  mutable Predicate pred;
-};
-
-// Used by: remove_if
-template <typename Predicate> struct negate_predicate_key_fun {
-  typedef bool result_of;
-  negate_predicate_key_fun(Predicate _pred) : pred(_pred) {}
-
-  template <typename _T1> result_of operator()(_T1 &&a) const {
-    using std::get;
-    return !pred(get<1>(a));
-  }
-
-private:
-  mutable Predicate pred;
-};
-
-template <typename T> struct sequence_fun {
-  using result_type = T;
-  sequence_fun(T _init, T _step) : init(_init), step(_step) {}
-
-  template <typename _T> result_type operator()(_T &&i) const {
-    return static_cast<T>(init + step * i);
-  }
-
-private:
-  const T init;
-  const T step;
-};
-
-//[binary_pred](Ref a, Ref b){ return(binary_pred(get<0>(a),get<0>(b)));
-template <typename Predicate> struct unique_fun {
-  typedef bool result_of;
-  unique_fun(Predicate _pred) : pred(_pred) {}
-  template <typename _T> result_of operator()(_T &&a, _T &&b) const {
-    using std::get;
-    return pred(get<0>(a), get<0>(b));
-  }
-
-private:
-  mutable Predicate pred;
-};
-
-// Lambda: [pred, &new_value](Ref1 a, Ref2 s) {return pred(s) ? new_value : a;
-// });
-template <typename T, typename Predicate> struct replace_if_fun {
-public:
-  typedef T result_of;
-  replace_if_fun(Predicate _pred, T _new_value)
-      : pred(_pred), new_value(_new_value) {}
-
-  template <typename _T1, typename _T2> T operator()(_T1 &&a, _T2 &&s) const {
-    return pred(s) ? new_value : a;
-  }
-
-private:
-  mutable Predicate pred;
-  const T new_value;
-};
-
-//[pred,op](Ref a){return pred(a) ? op(a) : a; }
-template <typename T, typename Predicate, typename Operator>
-struct transform_if_fun {
-  transform_if_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {}
-  template <typename _T>
-  void operator()(_T&& t) const {
-    using std::get;
-    if (pred(get<0>(t)))
-      get<1>(t) = op(get<0>(t));
-  }
-
-private:
-  mutable Predicate pred;
-  mutable Operator op;
-};
-
-//[pred, op](Ref1 a, Ref2 s) { return pred(s) ? op(a) : a; });
-template <typename T, typename Predicate, typename Operator>
-struct transform_if_unary_zip_mask_fun {
-  transform_if_unary_zip_mask_fun(Predicate _pred, Operator _op) : pred(_pred), op(_op) {}
-  template <typename _T>
-  void operator()(_T&& t) const {
-    using std::get;
-    if (pred(get<1>(t)))
-      get<2>(t) = op(get<0>(t));
-  }
-
-private:
-  mutable Predicate pred;
-  mutable Operator op;
-};
-
-template <typename T, typename Predicate, typename BinaryOperation>
-class transform_if_zip_mask_fun {
-public:
-  transform_if_zip_mask_fun(Predicate _pred = oneapi::dpl::identity(),
-                            BinaryOperation _op = oneapi::dpl::identity())
-      : pred(_pred), op(_op) {}
-  template <typename _T> void operator()(_T &&t) const {
-    using std::get;
-    if (pred(get<2>(t)))
-      get<3>(t) = op(get<0>(t), get<1>(t));
-  }
-
-private:
-  mutable Predicate pred;
-  mutable BinaryOperation op;
-};
-
-// This following code is similar to a section of code in
-// oneDPL/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_radix_sort.h
-// It has a similar approach, and could be consolidated.
-// Outside of some differences in approach, there are two significant
-// differences in function.
-//
-// 1) This code allows the output type of the bit range translation to be fit
-// into to the minimal type required to provide that many bits. The code in
-// oneDPL to calculate the bucket for the radix is similar but its output is
-// always std::uint32_t.  The assumption that the bit range desired will fit in
-// 32 bits is not true for this code.
-//
-// 2) This code ensures that for floating point type, -0.0f and 0.0f map to the
-// same value.  This allows the output of this translation to be used to provide
-// a sort which ensures the stability of these values for floating point types.
-
-template <int N> struct uint_byte_map {};
-template <> struct uint_byte_map<1> { using type = uint8_t; };
-template <> struct uint_byte_map<2> { using type = uint16_t; };
-template <> struct uint_byte_map<4> { using type = uint32_t; };
-template <> struct uint_byte_map<8> { using type = uint64_t; };
-
-template <typename T> struct uint_map {
-  using type = typename uint_byte_map<sizeof(T)>::type;
-};
-
-template <typename T, typename OutKeyT> class translate_key {
-  using uint_type_t = typename uint_map<T>::type;
-
-public:
-  translate_key(int begin_bit, int end_bit) {
-    shift = begin_bit;
-    mask = ~OutKeyT(0); // all ones
-    mask = mask >> (sizeof(OutKeyT) * 8 -
-                    (end_bit - begin_bit));           // setup appropriate mask
-    flip_sign = uint_type_t(1) << (sizeof(uint_type_t) * 8 - 1); // sign bit
-    flip_key = ~uint_type_t(0);                       // 0xF...F
-  }
-
-  inline OutKeyT operator()(const T &key) const {
-    uint_type_t intermediate;
-    if constexpr (std::is_floating_point<T>::value) {
-        // normal case (both -0.0f and 0.0f equal -0.0f)
-        if (key != T(-0.0f)) {
-        uint_type_t is_negative = reinterpret_cast<const uint_type_t &>(key) >>
-              (sizeof(uint_type_t) * 8 - 1);
-          intermediate = reinterpret_cast<const uint_type_t &>(key) ^
-                         ((is_negative * flip_key) | flip_sign);
-        } else // special case for -0.0f to keep stability with 0.0f
-        {
-          T negzero = T(-0.0f);
-          intermediate = reinterpret_cast<const uint_type_t &>(negzero);
-        }
-    } else if constexpr (std::is_signed<T>::value) {
-        intermediate = reinterpret_cast<const uint_type_t &>(key) ^ flip_sign;
-    } else {
-      intermediate = key;
-    }
-
-    return static_cast<OutKeyT>(intermediate >> shift) &
-           mask; // shift, cast, and mask
-  }
-
-private:
-  uint8_t shift;
-  OutKeyT mask;
-  uint_type_t flip_sign;
-  uint_type_t flip_key;
-};
-
-// Unary operator that returns reference to its argument. Ported from
-// oneDPL: oneapi/dpl/pstl/utils.h
-struct no_op_fun {
-  template <typename Tp> Tp &&operator()(Tp &&a) const {
-    return ::std::forward<Tp>(a);
-  }
-};
-
-// Unary functor which composes a pair of functors by calling them in succession
-// on an input
-template <typename FunctorInner, typename FunctorOuter>
-struct __composition_functor {
-  __composition_functor(FunctorInner in, FunctorOuter out)
-      : _in(in), _out(out) {}
-  template <typename T> T operator()(const T &i) const {
-    return _out(_in(i));
-  }
-  FunctorInner _in;
-  FunctorOuter _out;
-};
-
-// Unary functor which maps an index of a ROI into a 2D flattened array
-template <typename OffsetT> struct __roi_2d_index_functor {
-  __roi_2d_index_functor(const OffsetT &num_cols,
-                         const ::std::size_t &row_stride)
-      : _num_cols(num_cols), _row_stride(row_stride) {}
-
-  template <typename Index> Index operator()(const Index &i) const {
-    return _row_stride * (i / _num_cols) + (i % _num_cols);
-  }
-
-  OffsetT _num_cols;
-  ::std::size_t _row_stride;
-};
-
-// Unary functor which maps and index into an interleaved array by its active
-// channel
-template <typename OffsetT> struct __interleaved_index_functor {
-  __interleaved_index_functor(const OffsetT &total_channels,
-                              const OffsetT &active_channel)
-      : _total_channels(total_channels), _active_channel(active_channel) {}
-
-  template <typename Index> Index operator()(const Index &i) const {
-    return i * _total_channels + _active_channel;
-  }
-
-  OffsetT _total_channels;
-  OffsetT _active_channel;
-};
-
-} // end namespace internal
-
-} // end namespace dpct
-
-#endif
diff --git a/dpct/dpl_extras/iterators.h b/dpct/dpl_extras/iterators.h
deleted file mode 100644
index 2e1d10986728e..0000000000000
--- a/dpct/dpl_extras/iterators.h
+++ /dev/null
@@ -1,347 +0,0 @@
-//==---- iterators.h ------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_ITERATORS_H__
-#define __DPCT_ITERATORS_H__
-
-#include <oneapi/dpl/iterator>
-
-#include "functional.h"
-
-namespace dpct {
-
-namespace internal {
-
-// Wrapper class returned from a dereferenced transform_iterator which was
-// created using
-//  make_transform_output_iterator(). Used to apply the supplied transform
-//  function when writing into an object of this class.
-//
-// Example:
-// int a[] = {0, 1, 2, 3, 4};
-// int* p = a;
-// auto f = [](auto v) {return v*v;};
-// auto tr_out = dpct::make_transform_output_iterator(p+1, f);
-// auto wrap = *tr_out;         // wrap is a transform_output_ref_wrapper
-// std::cout<<*(p+1)<<std::endl;  // '1'
-// wrap = 2;                    // apply function, store 2*2=4
-// std::cout<<*(p+1)<<std::endl;  // '4'
-template <typename T, typename _UnaryFunc> class transform_output_ref_wrapper {
-private:
-  T __my_reference_;
-  _UnaryFunc __my_unary_func_;
-
-public:
-  template <typename U>
-  transform_output_ref_wrapper(U &&__reference, _UnaryFunc __unary_func)
-      : __my_reference_(std::forward<U>(__reference)),
-        __my_unary_func_(__unary_func) {}
-
-  // When writing to an object of this type, apply the supplied unary function,
-  // then write to the wrapped reference
-  template <typename UnaryInputType>
-  transform_output_ref_wrapper &operator=(const UnaryInputType &e) {
-    __my_reference_ = __my_unary_func_(e);
-    return *this;
-  }
-};
-
-// Unary functor to create a transform_output_reference_wrapper when a
-// transform_iterator is dereferenced, so that a
-// the supplied unary function may be applied on write, resulting in a
-// transform_output_iterator
-template <typename _UnaryFunc> struct _Unary_Out {
-  _Unary_Out(_UnaryFunc __f_) : __f(__f_) {}
-  _UnaryFunc __f;
-  template <typename T> auto operator()(T &&val) const {
-    return transform_output_ref_wrapper<T, _UnaryFunc>(std::forward<T>(val),
-                                                       __f);
-  }
-};
-
-} // end namespace internal
-
-using std::advance;
-
-using std::distance;
-
-template <typename T>
-oneapi::dpl::counting_iterator<T> make_counting_iterator(const T &input) {
-  return oneapi::dpl::counting_iterator<T>(input);
-}
-
-template <typename _Tp> class constant_iterator {
-public:
-  typedef std::false_type is_hetero;
-  typedef std::true_type is_passed_directly;
-  typedef std::ptrdiff_t difference_type;
-  typedef _Tp value_type;
-  typedef _Tp *pointer;
-  // There is no storage behind the iterator, so we return a value instead of
-  // reference.
-  typedef const _Tp reference;
-  typedef const _Tp const_reference;
-  typedef std::random_access_iterator_tag iterator_category;
-
-  explicit constant_iterator(_Tp __init)
-      : __my_value_(__init), __my_counter_(0) {}
-
-private:
-  // used to construct iterator instances with different counter values required
-  // by arithmetic operators
-  constant_iterator(const _Tp &__value, const difference_type &__offset)
-      : __my_value_(__value), __my_counter_(__offset) {}
-
-public:
-  // non-const variants of access operators are not provided so unintended
-  // writes are caught at compile time.
-  const_reference operator*() const { return __my_value_; }
-  const_reference operator[](difference_type) const { return __my_value_; }
-
-  difference_type operator-(const constant_iterator &__it) const {
-    return __my_counter_ - __it.__my_counter_;
-  }
-
-  constant_iterator &operator+=(difference_type __forward) {
-    __my_counter_ += __forward;
-    return *this;
-  }
-  constant_iterator &operator-=(difference_type __backward) {
-    return *this += -__backward;
-  }
-  constant_iterator &operator++() { return *this += 1; }
-  constant_iterator &operator--() { return *this -= 1; }
-
-  constant_iterator operator++(int) {
-    constant_iterator __it(*this);
-    ++(*this);
-    return __it;
-  }
-  constant_iterator operator--(int) {
-    constant_iterator __it(*this);
-    --(*this);
-    return __it;
-  }
-
-  constant_iterator operator-(difference_type __backward) const {
-    return constant_iterator(__my_value_, __my_counter_ - __backward);
-  }
-  constant_iterator operator+(difference_type __forward) const {
-    return constant_iterator(__my_value_, __my_counter_ + __forward);
-  }
-  friend constant_iterator operator+(difference_type __forward,
-                                     const constant_iterator __it) {
-    return __it + __forward;
-  }
-
-  bool operator==(const constant_iterator &__it) const {
-    return __my_value_ == __it.__my_value_ &&
-           this->__my_counter_ == __it.__my_counter_;
-  }
-  bool operator!=(const constant_iterator &__it) const {
-    return !(*this == __it);
-  }
-  bool operator<(const constant_iterator &__it) const {
-    return *this - __it < 0;
-  }
-  bool operator>(const constant_iterator &__it) const { return __it < *this; }
-  bool operator<=(const constant_iterator &__it) const {
-    return !(*this > __it);
-  }
-  bool operator>=(const constant_iterator &__it) const {
-    return !(*this < __it);
-  }
-
-private:
-  _Tp __my_value_;
-  uint64_t __my_counter_;
-};
-
-template <typename _Tp>
-constant_iterator<_Tp> make_constant_iterator(_Tp __value) {
-  return constant_iterator<_Tp>(__value);
-}
-
-// key_value_pair class to represent a key and value, specifically a
-// dereferenced arg_index_input_iterator
-template <typename _KeyTp, typename _ValueTp> class key_value_pair {
-public:
-  key_value_pair() = default;
-
-  key_value_pair(const _KeyTp &_key, const _ValueTp &_value)
-      : key(_key), value(_value) {}
-
-  bool operator==(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const {
-    return (key == _kvp.key) && (value == _kvp.value);
-  }
-
-  bool operator!=(const key_value_pair<_KeyTp, _ValueTp> &_kvp) const {
-    return (key != _kvp.key) || (value != _kvp.value);
-  }
-
-  _KeyTp key;
-  _ValueTp value;
-};
-
-namespace detail {
-
-template <typename KeyTp, typename _ValueTp> struct make_key_value_pair {
-  template <typename ValRefTp>
-  key_value_pair<KeyTp, _ValueTp>
-  operator()(const oneapi::dpl::__internal::tuple<KeyTp, ValRefTp> &tup) const {
-    return ::dpct::key_value_pair<KeyTp, _ValueTp>(::std::get<0>(tup),
-                                                   ::std::get<1>(tup));
-  }
-};
-
-template <class T> struct __zip_iterator_impl;
-template <class... Ts> struct __zip_iterator_impl<std::tuple<Ts...>> {
-  using type = oneapi::dpl::zip_iterator<Ts...>;
-};
-
-} // end namespace detail
-
-// dpct::zip_iterator can only accept std::tuple type as template argument for
-// compatibility purpose. Please use oneapi::dpl::zip_iterator if you want to
-// pass iterator's types directly.
-template <typename... Ts>
-using zip_iterator = typename detail::__zip_iterator_impl<Ts...>::type;
-
-// arg_index_input_iterator is an iterator over a input iterator, with a index.
-// When dereferenced, it returns a key_value_pair, which can be interrogated for
-// the index key or the value from the input iterator
-template <typename InputIteratorT, typename OffsetT = ptrdiff_t,
-          typename OutputValueT =
-              typename ::std::iterator_traits<InputIteratorT>::value_type>
-class arg_index_input_iterator
-    : public oneapi::dpl::transform_iterator<
-          oneapi::dpl::zip_iterator<oneapi::dpl::counting_iterator<OffsetT>,
-                                    InputIteratorT>,
-          detail::make_key_value_pair<OffsetT, OutputValueT>> {
-  using arg_index_input_iterator_wrap = oneapi::dpl::transform_iterator<
-      oneapi::dpl::zip_iterator<oneapi::dpl::counting_iterator<OffsetT>,
-                                InputIteratorT>,
-      detail::make_key_value_pair<OffsetT, OutputValueT>>;
-
-public:
-  typedef OffsetT difference_type;
-
-  // signal to __get_sycl_range that this iterator is as a direct pass iterator
-  using is_zip = ::std::true_type;
-
-  arg_index_input_iterator(const arg_index_input_iterator_wrap &__arg_wrap)
-      : arg_index_input_iterator_wrap(__arg_wrap) {}
-  arg_index_input_iterator(InputIteratorT __iter)
-      : arg_index_input_iterator_wrap(
-            oneapi::dpl::make_zip_iterator(
-                oneapi::dpl::counting_iterator(OffsetT(0)), __iter),
-            detail::make_key_value_pair<OffsetT, OutputValueT>()) {}
-
-  arg_index_input_iterator &operator=(const arg_index_input_iterator &__input) {
-    arg_index_input_iterator_wrap::operator=(__input);
-    return *this;
-  }
-  arg_index_input_iterator &operator++() {
-    arg_index_input_iterator_wrap::operator++();
-    return *this;
-  }
-  arg_index_input_iterator &operator--() {
-    arg_index_input_iterator_wrap::operator--();
-    return *this;
-  }
-  arg_index_input_iterator operator++(int) {
-    arg_index_input_iterator __it(*this);
-    ++(*this);
-    return __it;
-  }
-  arg_index_input_iterator operator--(int) {
-    arg_index_input_iterator __it(*this);
-    --(*this);
-    return __it;
-  }
-  arg_index_input_iterator operator+(difference_type __forward) const {
-    return arg_index_input_iterator(
-        arg_index_input_iterator_wrap::operator+(__forward));
-  }
-  arg_index_input_iterator operator-(difference_type __backward) const {
-    return arg_index_input_iterator(
-        arg_index_input_iterator_wrap::operator-(__backward));
-  }
-  arg_index_input_iterator &operator+=(difference_type __forward) {
-    arg_index_input_iterator_wrap::operator+=(__forward);
-    return *this;
-  }
-  arg_index_input_iterator &operator-=(difference_type __backward) {
-    arg_index_input_iterator_wrap::operator-=(__backward);
-    return *this;
-  }
-
-  friend arg_index_input_iterator
-  operator+(difference_type __forward, const arg_index_input_iterator &__it) {
-    return __it + __forward;
-  }
-
-  difference_type operator-(const arg_index_input_iterator &__it) const {
-    return arg_index_input_iterator_wrap::operator-(__it);
-  }
-  bool operator==(const arg_index_input_iterator &__it) const {
-    return arg_index_input_iterator_wrap::operator==(__it);
-  }
-  bool operator!=(const arg_index_input_iterator &__it) const {
-    return !(*this == __it);
-  }
-  bool operator<(const arg_index_input_iterator &__it) const {
-    return *this - __it < 0;
-  }
-  bool operator>(const arg_index_input_iterator &__it) const {
-    return __it < *this;
-  }
-  bool operator<=(const arg_index_input_iterator &__it) const {
-    return !(*this > __it);
-  }
-  bool operator>=(const arg_index_input_iterator &__it) const {
-    return !(*this < __it);
-  }
-
-  // returns an arg_index_input_iterator with the same iter position, but a
-  // count reset to 0
-  arg_index_input_iterator create_normalized() {
-    return arg_index_input_iterator(
-        ::std::get<1>(arg_index_input_iterator_wrap::base().base()));
-  }
-};
-
-template <typename IterT> struct io_iterator_pair {
-  inline io_iterator_pair() : selector(false) {}
-
-  inline io_iterator_pair(const IterT &first, const IterT &second)
-      : selector(false) {
-    iter[0] = first;
-    iter[1] = second;
-  }
-
-  inline IterT first() const { return selector ? iter[1] : iter[0]; }
-
-  inline IterT second() const { return selector ? iter[0] : iter[1]; }
-
-  inline void swap() { selector = !selector; }
-
-  bool selector;
-
-  IterT iter[2];
-};
-
-template <typename _Iter, typename _UnaryFunc>
-auto make_transform_output_iterator(_Iter __it, _UnaryFunc __unary_func) {
-  return oneapi::dpl::transform_iterator(
-      __it, internal::_Unary_Out<_UnaryFunc>(__unary_func));
-}
-
-} // end namespace dpct
-
-#endif
diff --git a/dpct/dpl_extras/memory.h b/dpct/dpl_extras/memory.h
deleted file mode 100644
index 08b965133f519..0000000000000
--- a/dpct/dpl_extras/memory.h
+++ /dev/null
@@ -1,1024 +0,0 @@
-//==---- memory.h ---------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_MEMORY_H__
-#define __DPCT_MEMORY_H__
-
-#include <sycl/sycl.hpp>
-#include <oneapi/dpl/memory>
-#include "functional.h"
-
-// Memory management section:
-// device_pointer, device_reference, swap, device_iterator, malloc_device,
-// device_new, free_device, device_delete
-namespace dpct {
-namespace detail {
-template <typename T>
-struct make_allocatable
-{
-  using type = T;
-};
-
-template <>
-struct make_allocatable<void>
-{
-  using type = dpct::byte_t;
-};
-
-#if defined(__LIBSYCL_MAJOR_VERSION) && defined(__LIBSYCL_MINOR_VERSION) &&    \
-    defined(__LIBSYCL_PATCH_VERSION)
-#define _DPCT_LIBSYCL_VERSION                                                  \
-  (__LIBSYCL_MAJOR_VERSION * 10000 + __LIBSYCL_MINOR_VERSION * 100 +           \
-   __LIBSYCL_PATCH_VERSION)
-#else
-#define _DPCT_LIBSYCL_VERSION 0
-#endif
-
-template <typename _DataT>
-using __buffer_allocator =
-#if _DPCT_LIBSYCL_VERSION >= 60000
-    sycl::buffer_allocator<typename make_allocatable<_DataT>::type>;
-#else
-    sycl::buffer_allocator;
-#endif
-} // namespace detail
-
-#ifdef DPCT_USM_LEVEL_NONE
-template <typename T, sycl::access_mode Mode = sycl::access_mode::read_write,
-          typename Allocator = detail::__buffer_allocator<T>>
-class device_pointer;
-#else
-template <typename T> class device_pointer;
-#endif
-
-template <typename T> struct device_reference {
-  using pointer = device_pointer<T>;
-  using value_type = T;
-  template <typename OtherT>
-  device_reference(const device_reference<OtherT> &input)
-      : value(input.value) {}
-  device_reference(const pointer &input) : value((*input).value) {}
-  device_reference(value_type &input) : value(input) {}
-  template <typename OtherT>
-  device_reference &operator=(const device_reference<OtherT> &input) {
-    value = input;
-    return *this;
-  };
-  device_reference &operator=(const device_reference &input) {
-    T val = input.value;
-    value = val;
-    return *this;
-  };
-  device_reference &operator=(const value_type &x) {
-    value = x;
-    return *this;
-  };
-  pointer operator&() const { return pointer(&value); };
-  operator value_type() const { return T(value); }
-  device_reference &operator++() {
-    ++value;
-    return *this;
-  };
-  device_reference &operator--() {
-    --value;
-    return *this;
-  };
-  device_reference operator++(int) {
-    device_reference ref(*this);
-    ++(*this);
-    return ref;
-  };
-  device_reference operator--(int) {
-    device_reference ref(*this);
-    --(*this);
-    return ref;
-  };
-  device_reference &operator+=(const T &input) {
-    value += input;
-    return *this;
-  };
-  device_reference &operator-=(const T &input) {
-    value -= input;
-    return *this;
-  };
-  device_reference &operator*=(const T &input) {
-    value *= input;
-    return *this;
-  };
-  device_reference &operator/=(const T &input) {
-    value /= input;
-    return *this;
-  };
-  device_reference &operator%=(const T &input) {
-    value %= input;
-    return *this;
-  };
-  device_reference &operator&=(const T &input) {
-    value &= input;
-    return *this;
-  };
-  device_reference &operator|=(const T &input) {
-    value |= input;
-    return *this;
-  };
-  device_reference &operator^=(const T &input) {
-    value ^= input;
-    return *this;
-  };
-  device_reference &operator<<=(const T &input) {
-    value <<= input;
-    return *this;
-  };
-  device_reference &operator>>=(const T &input) {
-    value >>= input;
-    return *this;
-  };
-  void swap(device_reference &input) {
-    T tmp = (*this);
-    *this = (input);
-    input = (tmp);
-  }
-  T &value;
-};
-
-template <typename T>
-void swap(device_reference<T> &x, device_reference<T> &y) {
-  x.swap(y);
-}
-
-template <typename T> void swap(T &x, T &y) {
-  T tmp = x;
-  x = y;
-  y = tmp;
-}
-
-template <typename T>
-::std::ostream &operator<<(::std::ostream &out,
-                           const device_reference<T> &ref) {
-  return out << T(ref);
-}
-
-namespace internal {
-// struct for checking if iterator is heterogeneous or not
-template <typename Iter,
-          typename Void = void> // for non-heterogeneous iterators
-struct is_hetero_iterator : std::false_type {};
-
-template <typename Iter> // for heterogeneous iterators
-struct is_hetero_iterator<
-    Iter, typename std::enable_if<Iter::is_hetero::value, void>::type>
-    : std::true_type {};
-} // namespace internal
-
-#ifdef DPCT_USM_LEVEL_NONE
-// Must be forward declared due to default argument
-template <typename T>
-device_pointer<T> device_new(device_pointer<void>, const T &,
-                             const std::size_t = 1);
-
-template <typename T, sycl::access_mode Mode, typename Allocator>
-class device_iterator;
-
-template <typename ValueType, typename Allocator, typename Derived>
-class device_pointer_base {
-protected:
-  sycl::buffer<ValueType, 1, Allocator> buffer;
-  std::size_t idx;
-
-  // Declare friend to give access to protected buffer and idx members
-  template <typename T>
-  friend device_pointer<T> device_new(device_pointer<void>, const T &,
-                                      const std::size_t);
-
-public:
-  using pointer = ValueType *;
-  using difference_type = std::make_signed<std::size_t>::type;
-
-  device_pointer_base(sycl::buffer<ValueType, 1> in, std::size_t i = 0)
-      : buffer(in), idx(i) {}
-#ifdef __USE_DPCT
-  template <typename OtherT>
-  device_pointer_base(OtherT *ptr)
-      : buffer(
-            dpct::detail::mem_mgr::instance()
-                .translate_ptr(ptr)
-                .buffer.template reinterpret<ValueType, 1>(sycl::range<1>(
-                    dpct::detail::mem_mgr::instance().translate_ptr(ptr).size /
-                    sizeof(ValueType)))),
-        idx(ptr - (ValueType*)dpct::detail::mem_mgr::instance()
-                .translate_ptr(ptr).alloc_ptr) {}
-#endif
-  device_pointer_base(const std::size_t count)
-      : buffer(sycl::range<1>(count / sizeof(ValueType))), idx() {}
-  // buffer has no default ctor we pass zero-range to create an empty buffer
-  device_pointer_base() : buffer(sycl::range<1>(0)) {}
-  device_pointer_base(const device_pointer_base &in)
-      : buffer(in.buffer), idx(in.idx) {}
-  pointer get() const {
-    auto res =
-        (const_cast<device_pointer_base *>(this)
-             ->buffer.template get_access<sycl::access_mode::read_write>())
-            .get_pointer();
-    return res + idx;
-  }
-  operator ValueType *() {
-    auto res = (buffer.template get_access<sycl::access_mode::read_write>())
-                   .get_pointer();
-    return res + idx;
-  }
-  operator ValueType *() const {
-    auto res =
-        (const_cast<device_pointer_base *>(this)
-             ->buffer.template get_access<sycl::access_mode::read_write>())
-            .get_pointer();
-    return res + idx;
-  }
-  Derived operator+(difference_type forward) const {
-    return Derived{buffer, idx + forward};
-  }
-  Derived operator-(difference_type backward) const {
-    return Derived{buffer, idx - backward};
-  }
-  Derived operator++(int) {
-    Derived p(buffer, idx);
-    idx += 1;
-    return p;
-  }
-  Derived operator--(int) {
-    Derived p(buffer, idx);
-    idx -= 1;
-    return p;
-  }
-  difference_type operator-(const Derived &it) const { return idx - it.idx; }
-  template <typename OtherIterator>
-  typename std::enable_if<internal::is_hetero_iterator<OtherIterator>::value,
-                          difference_type>::type
-  operator-(const OtherIterator &it) const {
-    return idx - std::distance(oneapi::dpl::begin(buffer), it);
-  }
-
-  std::size_t get_idx() const { return idx; } // required
-
-  sycl::buffer<ValueType, 1, Allocator> get_buffer() {
-    return buffer;
-  } // required
-};
-
-template <sycl::access_mode Mode, typename Allocator>
-class device_pointer<void, Mode, Allocator>
-    : public device_pointer_base<dpct::byte_t, Allocator,
-                                 device_pointer<void, Mode, Allocator>> {
-private:
-  using base_type =
-      device_pointer_base<dpct::byte_t, Allocator, device_pointer>;
-
-public:
-  using value_type = dpct::byte_t;
-  using difference_type = std::make_signed<std::size_t>::type;
-  using pointer = void *;
-  using reference = value_type &;
-  using iterator_category = std::random_access_iterator_tag;
-  using is_hetero = std::true_type; // required
-  using is_passed_directly = std::false_type;
-  static constexpr sycl::access_mode mode = Mode; // required
-
-  device_pointer(sycl::buffer<value_type, 1> in, std::size_t i = 0)
-      : base_type(in, i) {}
-#ifdef __USE_DPCT
-  template <typename OtherT> device_pointer(OtherT *ptr) : base_type(ptr) {}
-#endif
-  // needed for malloc_device, count is number of bytes to allocate
-  device_pointer(const std::size_t count) : base_type(count) {}
-  device_pointer() : base_type() {}
-  device_pointer(const device_pointer &in) : base_type(in) {}
-  device_pointer &operator+=(difference_type forward) {
-    this->idx += forward;
-    return *this;
-  }
-  device_pointer &operator-=(difference_type backward) {
-    this->idx -= backward;
-    return *this;
-  }
-  // include operators from base class
-  using base_type::operator++;
-  using base_type::operator--;
-  device_pointer &operator++() {
-    this->idx += 1;
-    return *this;
-  }
-  device_pointer &operator--() {
-    this->idx -= 1;
-    return *this;
-  }
-};
-
-template <typename T, sycl::access_mode Mode, typename Allocator>
-class device_pointer
-    : public device_pointer_base<T, Allocator,
-                                 device_pointer<T, Mode, Allocator>> {
-private:
-  using base_type = device_pointer_base<T, Allocator, device_pointer>;
-
-public:
-  using value_type = T;
-  using difference_type = std::make_signed<std::size_t>::type;
-  using pointer = T *;
-  using reference = T &;
-  using iterator_category = std::random_access_iterator_tag;
-  using is_hetero = std::true_type; // required
-  using is_passed_directly = std::false_type;
-  static constexpr sycl::access_mode mode = Mode; // required
-
-  device_pointer(sycl::buffer<T, 1> in, std::size_t i = 0) : base_type(in, i) {}
-#ifdef __USE_DPCT
-  template <typename OtherT> device_pointer(OtherT *ptr) : base_type(ptr) {}
-#endif
-  // needed for malloc_device, count is number of bytes to allocate
-  device_pointer(const std::size_t count) : base_type(count) {}
-  device_pointer() : base_type() {}
-  device_pointer(const device_pointer &in) : base_type(in) {}
-  device_pointer &operator+=(difference_type forward) {
-    this->idx += forward;
-    return *this;
-  }
-  device_pointer &operator-=(difference_type backward) {
-    this->idx -= backward;
-    return *this;
-  }
-  operator device_pointer<void>() {
-    auto converted_buf = (this->buffer)
-                             .template reinterpret<dpct::byte_t>(sycl::range<1>(
-                                 sizeof(value_type) * this->buffer.size()));
-    return device_pointer<void>(converted_buf, this->idx);
-  }
-  // include operators from base class
-  using base_type::operator++;
-  using base_type::operator--;
-  device_pointer &operator++() {
-    this->idx += 1;
-    return *this;
-  }
-  device_pointer &operator--() {
-    this->idx -= 1;
-    return *this;
-  }
-};
-#else
-template <typename T> class device_iterator;
-
-template <typename ValueType, typename Derived> class device_pointer_base {
-protected:
-  ValueType *ptr;
-
-public:
-  using pointer = ValueType *;
-  using difference_type = std::make_signed<std::size_t>::type;
-
-  device_pointer_base(ValueType *p) : ptr(p) {}
-  device_pointer_base(const std::size_t count) {
-    sycl::queue default_queue = dpct::get_default_queue();
-    ptr = static_cast<ValueType *>(sycl::malloc_shared(
-        count, default_queue.get_device(), default_queue.get_context()));
-  }
-  device_pointer_base() {}
-  pointer get() const { return ptr; }
-  operator ValueType *() { return ptr; }
-  operator ValueType *() const { return ptr; }
-
-  ValueType &operator[](difference_type idx) { return ptr[idx]; }
-  ValueType &operator[](difference_type idx) const { return ptr[idx]; }
-
-  Derived operator+(difference_type forward) const {
-    return Derived{ptr + forward};
-  }
-  Derived operator-(difference_type backward) const {
-    return Derived{ptr - backward};
-  }
-  Derived operator++(int) {
-    Derived p(ptr);
-    ++ptr;
-    return p;
-  }
-  Derived operator--(int) {
-    Derived p(ptr);
-    --ptr;
-    return p;
-  }
-  difference_type operator-(const Derived &it) const { return ptr - it.ptr; }
-};
-
-template <>
-class device_pointer<void>
-    : public device_pointer_base<dpct::byte_t, device_pointer<void>> {
-private:
-  using base_type = device_pointer_base<dpct::byte_t, device_pointer<void>>;
-
-public:
-  using value_type = dpct::byte_t;
-  using difference_type = std::make_signed<std::size_t>::type;
-  using pointer = void *;
-  using reference = value_type &;
-  using const_reference = const value_type &;
-  using iterator_category = std::random_access_iterator_tag;
-  using is_hetero = std::false_type;         // required
-  using is_passed_directly = std::true_type; // required
-
-  device_pointer(void *p) : base_type(static_cast<value_type *>(p)) {}
-  // needed for malloc_device, count is number of bytes to allocate
-  device_pointer(const std::size_t count) : base_type(count) {}
-  device_pointer() : base_type() {}
-  pointer get() const { return static_cast<pointer>(this->ptr); }
-  operator void *() { return this->ptr; }
-  operator void *() const { return this->ptr; }
-
-  // include operators from base class
-  using base_type::operator++;
-  using base_type::operator--;
-  device_pointer &operator++() {
-    ++(this->ptr);
-    return *this;
-  }
-  device_pointer &operator--() {
-    --(this->ptr);
-    return *this;
-  }
-  device_pointer &operator+=(difference_type forward) {
-    this->ptr = this->ptr + forward;
-    return *this;
-  }
-  device_pointer &operator-=(difference_type backward) {
-    this->ptr = this->ptr - backward;
-    return *this;
-  }
-};
-
-template <typename T>
-class device_pointer : public device_pointer_base<T, device_pointer<T>> {
-private:
-  using base_type = device_pointer_base<T, device_pointer<T>>;
-
-public:
-  using value_type = T;
-  using difference_type = std::make_signed<std::size_t>::type;
-  using pointer = T *;
-  using reference = T &;
-  using const_reference = const T &;
-  using iterator_category = std::random_access_iterator_tag;
-  using is_hetero = std::false_type;         // required
-  using is_passed_directly = std::true_type; // required
-
-  device_pointer(T *p) : base_type(p) {}
-  // needed for malloc_device, count is number of bytes to allocate
-  device_pointer(const std::size_t count) : base_type(count) {}
-  device_pointer() : base_type() {}
-  device_pointer &operator=(const device_iterator<T> &in) {
-    this->ptr = static_cast<device_pointer<T>>(in).ptr;
-    return *this;
-  }
-  operator device_pointer<void>() {
-    return device_pointer<void>(static_cast<void *>(this->ptr));
-  }
-  // include operators from base class
-  using base_type::operator++;
-  using base_type::operator--;
-  device_pointer &operator++() {
-    ++(this->ptr);
-    return *this;
-  }
-  device_pointer &operator--() {
-    --(this->ptr);
-    return *this;
-  }
-  device_pointer &operator+=(difference_type forward) {
-    this->ptr = this->ptr + forward;
-    return *this;
-  }
-  device_pointer &operator-=(difference_type backward) {
-    this->ptr = this->ptr - backward;
-    return *this;
-  }
-};
-#endif
-
-#ifdef DPCT_USM_LEVEL_NONE
-template <typename T, sycl::access_mode Mode = sycl::access_mode::read_write,
-          typename Allocator = detail::__buffer_allocator<T>>
-class device_iterator : public device_pointer<T, Mode, Allocator> {
-  using Base = device_pointer<T, Mode, Allocator>;
-
-public:
-  using value_type = T;
-  using difference_type = std::make_signed<std::size_t>::type;
-  using pointer = T *;
-  using reference = T &;
-  using iterator_category = std::random_access_iterator_tag;
-  using is_hetero = std::true_type;                // required
-  using is_passed_directly = std::false_type;      // required
-  static constexpr sycl::access_mode mode = Mode; // required
-
-  device_iterator() : Base() {}
-  device_iterator(sycl::buffer<T, 1, Allocator> vec, std::size_t index)
-      : Base(vec, index) {}
-  device_iterator(const Base &dev_ptr) : Base(dev_ptr) {}
-  template <sycl::access_mode inMode>
-  device_iterator(const device_iterator<T, inMode, Allocator> &in)
-      : Base(in.buffer, in.idx) {} // required for iter_mode
-  device_iterator &operator=(const device_iterator &in) {
-    Base::buffer = in.buffer;
-    Base::idx = in.idx;
-    return *this;
-  }
-
-  reference operator*() const {
-    return const_cast<device_iterator *>(this)
-        ->buffer.template get_access<mode>()[Base::idx];
-  }
-
-  reference operator[](difference_type i) const { return *(*this + i); }
-  device_iterator &operator++() {
-    ++Base::idx;
-    return *this;
-  }
-  device_iterator &operator--() {
-    --Base::idx;
-    return *this;
-  }
-  device_iterator operator++(int) {
-    device_iterator it(*this);
-    ++(*this);
-    return it;
-  }
-  device_iterator operator--(int) {
-    device_iterator it(*this);
-    --(*this);
-    return it;
-  }
-  device_iterator operator+(difference_type forward) const {
-    const auto new_idx = Base::idx + forward;
-    return {Base::buffer, new_idx};
-  }
-  device_iterator &operator+=(difference_type forward) {
-    Base::idx += forward;
-    return *this;
-  }
-  device_iterator operator-(difference_type backward) const {
-    return {Base::buffer, Base::idx - backward};
-  }
-  device_iterator &operator-=(difference_type backward) {
-    Base::idx -= backward;
-    return *this;
-  }
-  friend device_iterator operator+(difference_type forward,
-                                   const device_iterator &it) {
-    return it + forward;
-  }
-  difference_type operator-(const device_iterator &it) const {
-    return Base::idx - it.idx;
-  }
-  template <typename OtherIterator>
-  typename std::enable_if<internal::is_hetero_iterator<OtherIterator>::value,
-                          difference_type>::type
-  operator-(const OtherIterator &it) const {
-    return Base::idx - std::distance(oneapi::dpl::begin(Base::buffer), it);
-  }
-  bool operator==(const device_iterator &it) const { return *this - it == 0; }
-  bool operator!=(const device_iterator &it) const { return !(*this == it); }
-  bool operator<(const device_iterator &it) const { return *this - it < 0; }
-  bool operator>(const device_iterator &it) const { return it < *this; }
-  bool operator<=(const device_iterator &it) const { return !(*this > it); }
-  bool operator>=(const device_iterator &it) const { return !(*this < it); }
-
-  std::size_t get_idx() const { return Base::idx; } // required
-
-  sycl::buffer<T, 1, Allocator> get_buffer() {
-    return Base::buffer;
-  } // required
-};
-#else
-template <typename T> class device_iterator : public device_pointer<T> {
-  using Base = device_pointer<T>;
-
-protected:
-  std::size_t idx;
-
-public:
-  using value_type = T;
-  using difference_type = std::make_signed<std::size_t>::type;
-  using pointer = typename Base::pointer;
-  using reference = typename Base::reference;
-  using iterator_category = std::random_access_iterator_tag;
-  using is_hetero = std::false_type;         // required
-  using is_passed_directly = std::true_type; // required
-  static constexpr sycl::access_mode mode =
-      sycl::access_mode::read_write; // required
-
-  device_iterator() : Base(nullptr), idx(0) {}
-  device_iterator(T *vec, std::size_t index) : Base(vec), idx(index) {}
-  device_iterator(const Base &dev_ptr) : Base(dev_ptr), idx(0) {}
-  template <sycl::access_mode inMode>
-  device_iterator(const device_iterator<T> &in)
-      : Base(in.ptr), idx(in.idx) {} // required for iter_mode
-  device_iterator &operator=(const device_iterator &in) {
-    Base::operator=(in);
-    idx = in.idx;
-    return *this;
-  }
-
-  reference operator*() const { return *(Base::ptr + idx); }
-
-  reference operator[](difference_type i) { return Base::ptr[idx + i]; }
-  reference operator[](difference_type i) const { return Base::ptr[idx + i]; }
-  device_iterator &operator++() {
-    ++idx;
-    return *this;
-  }
-  device_iterator &operator--() {
-    --idx;
-    return *this;
-  }
-  device_iterator operator++(int) {
-    device_iterator it(*this);
-    ++(*this);
-    return it;
-  }
-  device_iterator operator--(int) {
-    device_iterator it(*this);
-    --(*this);
-    return it;
-  }
-  device_iterator operator+(difference_type forward) const {
-    const auto new_idx = idx + forward;
-    return {Base::ptr, new_idx};
-  }
-  device_iterator &operator+=(difference_type forward) {
-    idx += forward;
-    return *this;
-  }
-  device_iterator operator-(difference_type backward) const {
-    return {Base::ptr, idx - backward};
-  }
-  device_iterator &operator-=(difference_type backward) {
-    idx -= backward;
-    return *this;
-  }
-  friend device_iterator operator+(difference_type forward,
-                                   const device_iterator &it) {
-    return it + forward;
-  }
-  difference_type operator-(const device_iterator &it) const {
-    return idx - it.idx;
-  }
-
-  template <typename OtherIterator>
-  typename std::enable_if<internal::is_hetero_iterator<OtherIterator>::value,
-                          difference_type>::type
-  operator-(const OtherIterator &it) const {
-    return idx - it.get_idx();
-  }
-
-  bool operator==(const device_iterator &it) const { return *this - it == 0; }
-  bool operator!=(const device_iterator &it) const { return !(*this == it); }
-  bool operator<(const device_iterator &it) const { return *this - it < 0; }
-  bool operator>(const device_iterator &it) const { return it < *this; }
-  bool operator<=(const device_iterator &it) const { return !(*this > it); }
-  bool operator>=(const device_iterator &it) const { return !(*this < it); }
-
-  std::size_t get_idx() const { return idx; } // required
-
-  device_iterator &get_buffer() { return *this; } // required
-
-  std::size_t size() const { return idx; }
-};
-#endif
-
-struct sys_tag {};
-struct device_sys_tag : public sys_tag {};
-struct host_sys_tag : public sys_tag {};
-
-#ifdef DPCT_USM_LEVEL_NONE
-template <typename T, typename Tag> class tagged_pointer {
-  static_assert(false,
-                "tagged_pointer is not supported with DPCT_USM_LEVEL_NONE");
-};
-template <typename PolicyOrTag, typename Pointer>
-void release_temporary_allocation(PolicyOrTag &&policy_or_tag, Pointer ptr) {
-  static_assert(
-      false,
-      "release_temporary_allocation is not supported with DPCT_USM_LEVEL_NONE");
-}
-template <typename T, typename PolicyOrTag, typename SizeType>
-auto get_temporary_allocation(PolicyOrTag &&policy_or_tag,
-                              SizeType num_elements) {
-  static_assert(
-      false,
-      "get_temporary_allocation is not supported with DPCT_USM_LEVEL_NONE");
-}
-template <typename PolicyOrTag>
-auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_bytes) {
-  static_assert(false, "malloc is not supported with DPCT_USM_LEVEL_NONE");
-}
-template <typename T, typename PolicyOrTag>
-auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_elements) {
-  static_assert(false, "malloc<T> is not supported with DPCT_USM_LEVEL_NONE");
-}
-template <typename PolicyOrTag, typename Pointer>
-void free(PolicyOrTag &&policy_or_tag, Pointer ptr) {
-  static_assert(false, "free is not supported with DPCT_USM_LEVEL_NONE");
-}
-#else
-namespace internal {
-
-// Utility that converts a policy to a tag or reflects a provided tag
-template <typename PolicyOrTag> struct policy_or_tag_to_tag {
-private:
-  using decayed_policy_or_tag_t = ::std::decay_t<PolicyOrTag>;
-  using policy_conversion = ::std::conditional_t<
-      !is_hetero_execution_policy<decayed_policy_or_tag_t>::value, host_sys_tag,
-      device_sys_tag>;
-  static constexpr bool is_policy_v =
-      oneapi::dpl::execution::is_execution_policy_v<decayed_policy_or_tag_t>;
-  static constexpr bool is_sys_tag_v = ::std::disjunction_v<
-      ::std::is_same<decayed_policy_or_tag_t, host_sys_tag>,
-      ::std::is_same<decayed_policy_or_tag_t, device_sys_tag>>;
-  static_assert(is_policy_v || is_sys_tag_v,
-                "Only oneDPL policies or system tags may be provided");
-
-public:
-  using type = ::std::conditional_t<is_policy_v, policy_conversion,
-                                    decayed_policy_or_tag_t>;
-};
-
-template <typename PolicyOrTag>
-using policy_or_tag_to_tag_t = typename policy_or_tag_to_tag<PolicyOrTag>::type;
-
-template <typename PolicyOrTag> struct is_host_policy_or_tag {
-private:
-  using tag_t = policy_or_tag_to_tag_t<PolicyOrTag>;
-
-public:
-  static constexpr bool value = ::std::is_same_v<tag_t, host_sys_tag>;
-};
-
-template <typename PolicyOrTag>
-inline constexpr bool is_host_policy_or_tag_v =
-    is_host_policy_or_tag<PolicyOrTag>::value;
-
-} // namespace internal
-
-// TODO: Make this class an iterator adaptor.
-// tagged_pointer provides a wrapper around a raw pointer type with a tag of the
-// location of the allocated memory. Standard pointer operations are supported
-// with this class.
-template <typename T, typename Tag> class tagged_pointer {
-public:
-  using value_type = T;
-  using difference_type = ::std::ptrdiff_t;
-  using pointer = T *;
-  using reference = T &;
-  using iterator_category = std::random_access_iterator_tag;
-  using is_hetero = ::std::false_type;
-  using is_passed_directly = std::true_type;
-
-  tagged_pointer() : m_ptr(nullptr) {}
-  tagged_pointer(T *ptr) : m_ptr(ptr) {}
-  T &operator[](difference_type idx) { return this->m_ptr[idx]; }
-  const T &operator[](difference_type idx) const { return this->m_ptr[idx]; }
-  tagged_pointer operator+(difference_type forward) const {
-    return tagged_pointer{this->m_ptr + forward};
-  }
-  tagged_pointer operator-(difference_type backward) const {
-    return tagged_pointer{this->m_ptr - backward};
-  }
-  operator const T *() const { return m_ptr; }
-  operator T *() { return m_ptr; }
-  T &operator*() { return *this->m_ptr; }
-  const T &operator*() const { return *this->m_ptr; }
-  T *operator->() { return this->m_ptr; }
-  const T *operator->() const { return this->m_ptr; }
-  tagged_pointer operator++(int) {
-    tagged_pointer p(this->m_ptr);
-    ++this->m_ptr;
-    return p;
-  }
-  tagged_pointer operator--(int) {
-    tagged_pointer p(this->m_ptr);
-    --this->m_ptr;
-    return p;
-  }
-  tagged_pointer &operator++() {
-    ++this->m_ptr;
-    return *this;
-  }
-  tagged_pointer &operator--() {
-    --this->m_ptr;
-    return *this;
-  }
-  difference_type operator-(const tagged_pointer &it) const {
-    return this->m_ptr - it.m_ptr;
-  }
-  tagged_pointer &operator+=(difference_type forward) {
-    this->m_ptr = this->m_ptr + forward;
-    return *this;
-  }
-  tagged_pointer &operator-=(difference_type backward) {
-    this->m_ptr = this->m_ptr - backward;
-    return *this;
-  }
-
-private:
-  T *m_ptr;
-};
-
-// Void specialization for tagged pointers. Iterator traits are not provided but
-// conversion to other non-void tagged pointers is allowed. Pointer arithmetic
-// is disallowed with this specialization.
-template <typename Tag> class tagged_pointer<void, Tag> {
-public:
-  using difference_type = ::std::ptrdiff_t;
-  using pointer = void *;
-  tagged_pointer() : m_ptr(nullptr) {}
-  tagged_pointer(pointer ptr) : m_ptr(ptr) {}
-  operator const void *() const { return m_ptr; }
-  operator void *() { return m_ptr; }
-  // Enable tagged void pointer to convert to all other raw pointer types.
-  template <typename OtherPtr> operator OtherPtr *() const {
-    return static_cast<OtherPtr *>(this->m_ptr);
-  }
-
-private:
-  void *m_ptr;
-};
-
-namespace internal {
-
-// Internal utility to return raw pointer to allocated memory. Note that host
-// allocations are not device accessible (not pinned).
-template <typename PolicyOrTag>
-void *malloc_base(PolicyOrTag &&policy_or_tag, const ::std::size_t num_bytes) {
-  using decayed_policy_or_tag_t = ::std::decay_t<PolicyOrTag>;
-  if constexpr (internal::is_host_policy_or_tag_v<PolicyOrTag>) {
-    return ::std::malloc(num_bytes);
-  } else {
-    sycl::queue q;
-    // Grab the associated queue if a device policy is provided. Otherwise, use
-    // default constructed.
-    if constexpr (oneapi::dpl::execution::is_execution_policy_v<
-                      decayed_policy_or_tag_t>) {
-      q = policy_or_tag.queue();
-    } else {
-      q = get_default_queue();
-    }
-    return sycl::malloc_shared(num_bytes, q);
-  }
-}
-
-} // namespace internal
-
-template <typename PolicyOrTag>
-auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_bytes) {
-  return tagged_pointer<void, internal::policy_or_tag_to_tag_t<PolicyOrTag>>(
-      internal::malloc_base(::std::forward<PolicyOrTag>(policy_or_tag),
-                            num_bytes));
-}
-
-template <typename T, typename PolicyOrTag>
-auto malloc(PolicyOrTag &&policy_or_tag, const ::std::size_t num_elements) {
-  return tagged_pointer<T, internal::policy_or_tag_to_tag_t<PolicyOrTag>>(
-      static_cast<T *>(
-          internal::malloc_base(::std::forward<PolicyOrTag>(policy_or_tag),
-                                num_elements * sizeof(T))));
-}
-
-template <typename PolicyOrTag, typename Pointer>
-void free(PolicyOrTag &&policy_or_tag, Pointer ptr) {
-  using decayed_policy_or_tag_t = ::std::decay_t<PolicyOrTag>;
-  if constexpr (internal::is_host_policy_or_tag_v<PolicyOrTag>) {
-    ::std::free(ptr);
-  } else {
-    sycl::queue q;
-    // Grab the associated queue if a device policy is provided. Otherwise, use
-    // default constructed.
-    if constexpr (oneapi::dpl::execution::is_execution_policy_v<
-                      decayed_policy_or_tag_t>) {
-      q = policy_or_tag.queue();
-    } else {
-      q = get_default_queue();
-    }
-    sycl::free(ptr, q);
-  }
-}
-
-template <typename T, typename PolicyOrTag, typename SizeType>
-auto get_temporary_allocation(PolicyOrTag &&policy_or_tag,
-                              SizeType num_elements) {
-  auto allocation_ptr =
-      dpct::malloc<T>(::std::forward<PolicyOrTag>(policy_or_tag), num_elements);
-  if (allocation_ptr == nullptr)
-    return ::std::make_pair(allocation_ptr, SizeType(0));
-  return ::std::make_pair(allocation_ptr, num_elements);
-}
-
-template <typename PolicyOrTag, typename Pointer>
-void release_temporary_allocation(PolicyOrTag &&policy_or_tag, Pointer ptr) {
-  dpct::free(::std::forward<PolicyOrTag>(policy_or_tag), ptr);
-}
-#endif
-
-template <typename T>
-device_pointer<T> malloc_device(const std::size_t num_elements) {
-  return device_pointer<T>(num_elements * sizeof(T));
-}
-static inline device_pointer<void> malloc_device(const std::size_t num_bytes) {
-  return device_pointer<void>(num_bytes);
-}
-#ifdef DPCT_USM_LEVEL_NONE
-template <typename T>
-device_pointer<T> device_new(device_pointer<void> p, const T &value,
-                             const std::size_t count) {
-  auto converted_buf = p.buffer.template reinterpret<T>(sycl::range<1>(count));
-  ::std::uninitialized_fill(
-      oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
-      oneapi::dpl::begin(converted_buf),
-      oneapi::dpl::end(converted_buf), value);
-  return device_pointer<T>(converted_buf, p.idx);
-}
-// buffer manages lifetime
-template <typename T> void free_device(device_pointer<T> ptr) {}
-#else
-template <typename T>
-device_pointer<T> device_new(device_pointer<void> p, const T &value,
-                             const std::size_t count = 1) {
-  dpct::device_pointer<T> converted_p(static_cast<T *>(p.get()));
-  ::std::uninitialized_fill(
-      oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
-      converted_p, converted_p + count, value);
-  return converted_p;
-}
-template <typename T> void free_device(device_pointer<T> ptr) {
-  sycl::free(ptr.get(), dpct::get_default_queue());
-}
-#endif
-template <typename T>
-device_pointer<T> device_new(device_pointer<void> p,
-                             const std::size_t count = 1) {
-  return device_new(p, T{}, count);
-}
-template <typename T>
-device_pointer<T> device_new(const std::size_t count = 1) {
-  return device_new(device_pointer<void>(sizeof(T) * count), T{}, count);
-}
-
-template <typename T>
-typename std::enable_if<!std::is_trivially_destructible<T>::value, void>::type
-device_delete(device_pointer<T> p, const std::size_t count = 1) {
-  ::std::destroy(oneapi::dpl::execution::make_device_policy(dpct::get_default_queue()),
-                 p, p + count);
-  free_device(p);
-}
-template <typename T>
-typename std::enable_if<std::is_trivially_destructible<T>::value, void>::type
-device_delete(device_pointer<T> p, const std::size_t count = 1) {
-  free_device(p);
-}
-
-template <typename T> device_pointer<T> get_device_pointer(T *ptr) {
-  return device_pointer<T>(ptr);
-}
-
-template <typename T>
-device_pointer<T> get_device_pointer(const device_pointer<T> &ptr) {
-  return device_pointer<T>(ptr);
-}
-
-template <typename T> T *get_raw_pointer(const device_pointer<T> &ptr) {
-  return ptr.get();
-}
-
-template <typename Pointer> Pointer get_raw_pointer(const Pointer &ptr) {
-  return ptr;
-}
-
-template <typename T> const T &get_raw_reference(const device_reference<T> &ref) {
-  return ref.value;
-}
-
-template <typename T> T &get_raw_reference(device_reference<T> &ref) {
-  return ref.value;
-}
-
-template <typename T> const T &get_raw_reference(const T &ref) {
-  return ref;
-}
-
-template <typename T> T &get_raw_reference(T &ref) {
-  return ref;
-}
-
-} // namespace dpct
-
-#endif
diff --git a/dpct/dpl_extras/numeric.h b/dpct/dpl_extras/numeric.h
deleted file mode 100644
index 9864cd17359f3..0000000000000
--- a/dpct/dpl_extras/numeric.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//==---- numeric.h --------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_NUMERIC_H__
-#define __DPCT_NUMERIC_H__
-
-namespace dpct {
-
-template <typename Policy, typename InputIt1, typename InputIt2, typename T>
-T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1,
-                InputIt2 first2, T init) {
-  return std::transform_reduce(std::forward<Policy>(policy), first1, last1,
-                               first2, init);
-}
-
-template <typename Policy, typename InputIt1, typename InputIt2, typename T,
-          typename BinaryOperation1, typename BinaryOperation2>
-T inner_product(Policy &&policy, InputIt1 first1, InputIt1 last1,
-                InputIt2 first2, T init, BinaryOperation1 op1,
-                BinaryOperation2 op2) {
-  return std::transform_reduce(std::forward<Policy>(policy), first1, last1,
-                               first2, init, op1, op2);
-}
-
-} // end namespace dpct
-
-#endif
diff --git a/dpct/dpl_extras/vector.h b/dpct/dpl_extras/vector.h
deleted file mode 100644
index afba575ae1da9..0000000000000
--- a/dpct/dpl_extras/vector.h
+++ /dev/null
@@ -1,752 +0,0 @@
-//==---- vector.h ---------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_VECTOR_H__
-#define __DPCT_VECTOR_H__
-
-#include <oneapi/dpl/algorithm>
-#include <oneapi/dpl/execution>
-
-#include <sycl/sycl.hpp>
-
-#include "memory.h"
-
-#include <algorithm>
-#include <iterator>
-#include <vector>
-
-#include "../device.hpp"
-
-namespace dpct {
-
-namespace internal {
-template <typename Iter, typename Void = void> // for non-iterators
-struct is_iterator : std::false_type {};
-
-template <typename Iter> // For iterators
-struct is_iterator<
-    Iter,
-    typename std::enable_if<
-        !std::is_void<typename Iter::iterator_category>::value, void>::type>
-    : std::true_type {};
-
-template <typename T> // For pointers
-struct is_iterator<T *> : std::true_type {};
-} // end namespace internal
-
-#ifndef DPCT_USM_LEVEL_NONE
-
-template <typename T,
-          typename Allocator = sycl::usm_allocator<T, sycl::usm::alloc::shared>>
-class device_vector {
-public:
-  using iterator = device_iterator<T>;
-  using const_iterator = const iterator;
-  using reference = device_reference<T>;
-  using const_reference = const reference;
-  using value_type = T;
-  using pointer = T *;
-  using const_pointer = const T *;
-  using difference_type =
-      typename ::std::iterator_traits<iterator>::difference_type;
-  using size_type = ::std::size_t;
-
-private:
-  Allocator _alloc;
-  size_type _size;
-  size_type _capacity;
-  pointer _storage;
-
-  size_type _min_capacity() const { return size_type(1); }
-
-  void _set_capacity_and_alloc() {
-    _capacity = ::std::max(_size * 2, _min_capacity());
-    _storage = _alloc.allocate(_capacity);
-  }
-
-public:
-  template <typename OtherA> operator ::std::vector<T, OtherA>() const {
-    auto __tmp = ::std::vector<T, OtherA>(this->size());
-    ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-              this->begin(), this->end(), __tmp.begin());
-    return __tmp;
-  }
-  device_vector()
-      : _alloc(get_default_queue()), _size(0), _capacity(_min_capacity()) {
-    _set_capacity_and_alloc();
-  }
-  ~device_vector() /*= default*/ { _alloc.deallocate(_storage, _capacity); };
-  explicit device_vector(size_type n) : device_vector(n, T()) {}
-  explicit device_vector(size_type n, const T &value)
-      : _alloc(get_default_queue()), _size(n) {
-    _set_capacity_and_alloc();
-    if (_size > 0) {
-      ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                begin(), end(), T(value));
-    }
-  }
-  device_vector(const device_vector &other) : _alloc(get_default_queue()) {
-    _size = other.size();
-    _capacity = other.capacity();
-    _storage = _alloc.allocate(_capacity);
-    ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-              other.begin(), other.end(), begin());
-  }
-  device_vector(device_vector &&other)
-      : _alloc(get_default_queue()), _size(other.size()),
-        _capacity(other.capacity()), _storage(other._storage) {
-    other._size = 0;
-    other._capacity = 0; 
-    other._storage = nullptr;
-  }
-
-  template <typename InputIterator>
-  device_vector(InputIterator first,
-                typename ::std::enable_if<
-                    internal::is_iterator<InputIterator>::value &&
-                        !::std::is_pointer<InputIterator>::value &&
-                        ::std::is_same<typename ::std::iterator_traits<
-                                         InputIterator>::iterator_category,
-                                     ::std::random_access_iterator_tag>::value,
-                    InputIterator>::type last)
-      : _alloc(get_default_queue()) {
-    _size = ::std::distance(first, last);
-    _set_capacity_and_alloc();
-    if (_size > 0) {
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                first, last, begin());
-    }
-  }
-
-  template <typename InputIterator>
-  device_vector(InputIterator first,
-                typename ::std::enable_if<::std::is_pointer<InputIterator>::value,
-                                        InputIterator>::type last)
-      : _alloc(get_default_queue()) {
-    _size = ::std::distance(first, last);
-    _set_capacity_and_alloc();
-    if (_size > 0) {
-      auto ptr_type = sycl::get_pointer_type(first, get_default_context());
-      if (ptr_type != sycl::usm::alloc::host &&
-          ptr_type != sycl::usm::alloc::unknown) {
-        ::std::copy(
-            oneapi::dpl::execution::make_device_policy(get_default_queue()),
-            first, last, begin());
-      } else {
-        sycl::buffer<typename ::std::iterator_traits<InputIterator>::value_type,
-                     1>
-            buf(first, last);
-        auto buf_first = oneapi::dpl::begin(buf);
-        auto buf_last = oneapi::dpl::end(buf);
-        ::std::copy(
-            oneapi::dpl::execution::make_device_policy(get_default_queue()),
-            buf_first, buf_last, begin());
-      }
-    }
-  }
-
-  template <typename InputIterator>
-  device_vector(InputIterator first,
-                typename ::std::enable_if<
-                    internal::is_iterator<InputIterator>::value &&
-                        !::std::is_pointer<InputIterator>::value &&
-                        !::std::is_same<typename ::std::iterator_traits<
-                                          InputIterator>::iterator_category,
-                                      ::std::random_access_iterator_tag>::value,
-                    InputIterator>::type last)
-      : _alloc(get_default_queue()), _size(::std::distance(first, last)) {
-    _set_capacity_and_alloc();
-    ::std::vector<T> _tmp(first, last);
-    if (_size > 0) {
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                _tmp.begin(), _tmp.end(), this->begin());
-    }
-  }
-
-  template <typename OtherAllocator>
-  device_vector(const device_vector<T, OtherAllocator> &v)
-      : _alloc(get_default_queue()), _storage(v.real_begin()), _size(v.size()),
-        _capacity(v.capacity()) {}
-
-  template <typename OtherAllocator>
-  device_vector(::std::vector<T, OtherAllocator> &v)
-      : _alloc(get_default_queue()), _size(v.size()) {
-    _set_capacity_and_alloc();
-    if (_size > 0) {
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                v.begin(), v.end(), this->begin());
-    }
-  }
-
-  template <typename OtherAllocator>
-  device_vector &operator=(const ::std::vector<T, OtherAllocator> &v) {
-    resize(v.size());
-    if (_size > 0) {
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                v.begin(), v.end(), begin());
-    }
-    return *this;
-  }
-  device_vector &operator=(const device_vector &other) {
-    // Copy assignment operator:
-    resize(other.size());
-    if (_size > 0) {
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                other.begin(), other.end(), begin());
-    }
-    return *this;
-  }
-  device_vector &operator=(device_vector &&other) {
-    // Move assignment operator:
-    device_vector dummy(::std::move(other));
-    this->swap(dummy);
-    return *this;
-  }
-  size_type size() const { return _size; }
-  iterator begin() noexcept { return device_iterator<T>(_storage, 0); }
-  iterator end() { return device_iterator<T>(_storage, size()); }
-  const_iterator begin() const noexcept {
-    return device_iterator<T>(_storage, 0);
-  }
-  const_iterator cbegin() const noexcept { return begin(); }
-  const_iterator end() const { return device_iterator<T>(_storage, size()); }
-  const_iterator cend() const { return end(); }
-  T *real_begin() { return _storage; }
-  const T *real_begin() const { return _storage; }
-  void swap(device_vector &v) {
-    ::std::swap(_size, v._size);
-    ::std::swap(_capacity, v._capacity);
-    ::std::swap(_storage, v._storage);
-    ::std::swap(_alloc, v._alloc);
-  }
-  reference operator[](size_type n) { return _storage[n]; }
-  const_reference operator[](size_type n) const { return _storage[n]; }
-  void reserve(size_type n) {
-    if (n > capacity()) {
-      // allocate buffer for new size
-      auto tmp = _alloc.allocate(2 * n);
-      // copy content (old buffer to new buffer)
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                begin(), end(), tmp);
-      // deallocate old memory
-      _alloc.deallocate(_storage, _capacity);
-      _storage = tmp;
-      _capacity = 2 * n;
-    }
-  }
-  void resize(size_type new_size, const T &x = T()) {
-    reserve(new_size);
-    if (_size < new_size) {
-      ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                begin() + _size, begin() + new_size, x);
-    }
-    _size = new_size;
-  }
-  size_type max_size(void) const {
-    return ::std::numeric_limits<size_type>::max() / sizeof(T);
-  }
-  size_type capacity() const { return _capacity; }
-  const_reference front() const { return *begin(); }
-  reference front() { return *begin(); }
-  const_reference back(void) const { return *(end() - 1); }
-  reference back(void) { return *(end() - 1); }
-  pointer data(void) { return _storage; }
-  const_pointer data(void) const { return _storage; }
-  void shrink_to_fit(void) {
-    if (_size != capacity()) {
-      size_type tmp_capacity = ::std::max(_size, _min_capacity());
-      auto tmp = _alloc.allocate(tmp_capacity);
-      if (_size > 0) {
-        ::std::copy(
-            oneapi::dpl::execution::make_device_policy(get_default_queue()),
-            begin(), end(), tmp);
-      }
-      _alloc.deallocate(_storage, _capacity);
-      _storage = tmp;
-      _capacity = tmp_capacity;
-    }
-  }
-  void assign(size_type n, const T &x) {
-    resize(n);
-    if (_size > 0) {
-      ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                begin(), begin() + n, x);
-    }
-  }
-  template <typename InputIterator>
-  void
-  assign(InputIterator first,
-         typename ::std::enable_if<internal::is_iterator<InputIterator>::value,
-                                 InputIterator>::type last) {
-    auto n = ::std::distance(first, last);
-    resize(n);
-    if (_size > 0) {
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                first, last, begin());
-    }
-  }
-  void clear(void) { _size = 0; }
-  bool empty(void) const { return (size() == 0); }
-  void push_back(const T &x) { insert(end(), size_type(1), x); }
-  void pop_back(void) {
-    if (_size > 0)
-      --_size;
-  }
-  iterator erase(iterator first, iterator last) {
-    auto n = ::std::distance(first, last);
-    if (last == end()) {
-      _size = _size - n;
-      return end();
-    }
-    auto m = ::std::distance(last, end());
-    if (m <= 0) {
-      return end();
-    }
-    auto tmp = _alloc.allocate(m);
-    // copy remainder to temporary buffer.
-    ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-              last, end(), tmp);
-    // override (erase) subsequence in storage.
-    ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-              tmp, tmp + m, first);
-    _alloc.deallocate(tmp, m);
-    _size -= n;
-    return begin() + first.get_idx() + n;
-  }
-  iterator erase(iterator pos) { return erase(pos, pos + 1); }
-  iterator insert(iterator position, const T &x) {
-    auto n = ::std::distance(begin(), position);
-    insert(position, size_type(1), x);
-    return begin() + n;
-  }
-  void insert(iterator position, size_type n, const T &x) {
-    if (position == end()) {
-      resize(size() + n);
-      ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                end() - n, end(), x);
-    } else {
-      auto i_n = ::std::distance(begin(), position);
-      // allocate temporary storage
-      auto m = ::std::distance(position, end());
-      // will throw if position is not inside active vector
-      auto tmp = _alloc.allocate(m);
-      // copy remainder
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                position, end(), tmp);
-
-      resize(size() + n);
-      // resizing might invalidate position
-      position = begin() + position.get_idx();
-
-      ::std::fill(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                position, position + n, x);
-
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                tmp, tmp + m, position + n);
-      _alloc.deallocate(tmp, m);
-    }
-  }
-  template <typename InputIterator>
-  void
-  insert(iterator position, InputIterator first,
-         typename ::std::enable_if<internal::is_iterator<InputIterator>::value,
-                                 InputIterator>::type last) {
-    auto n = ::std::distance(first, last);
-    if (position == end()) {
-      resize(size() + n);
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                first, last, end());
-    } else {
-      auto m = ::std::distance(position, end());
-      // will throw if position is not inside active vector
-      auto tmp = _alloc.allocate(m);
-
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                position, end(), tmp);
-
-      resize(size() + n);
-      // resizing might invalidate position
-      position = begin() + position.get_idx();
-
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                first, last, position);
-      ::std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-                tmp, tmp + m, position + n);
-      _alloc.deallocate(tmp, m);
-    }
-  }
-  Allocator get_allocator() const { return _alloc; }
-};
-
-#else
-
-template <typename T, typename Allocator = detail::__buffer_allocator<T>>
-class device_vector {
-  static_assert(
-      std::is_same<Allocator, detail::__buffer_allocator<T>>::value,
-      "device_vector doesn't support custom allocator when USM is not used.");
-
-public:
-  using iterator = device_iterator<T>;
-  using const_iterator = const iterator;
-  using reference = device_reference<T>;
-  using const_reference = const reference;
-  using value_type = T;
-  using pointer = T *;
-  using const_pointer = const T *;
-  using difference_type =
-      typename std::iterator_traits<iterator>::difference_type;
-  using size_type = std::size_t;
-
-private:
-  using Buffer = sycl::buffer<T, 1>;
-  using Range = sycl::range<1>;
-  // Using mem_mgr to handle memory allocation
-  void *_storage;
-  size_type _size;
-
-  size_type _min_capacity() const { return size_type(1); }
-
-  void *alloc_store(size_type num_bytes) {
-    return detail::mem_mgr::instance().mem_alloc(num_bytes);
-  }
-
-public:
-  template <typename OtherA> operator std::vector<T, OtherA>() const {
-    auto __tmp = std::vector<T, OtherA>(this->size());
-    std::copy(oneapi::dpl::execution::dpcpp_default, this->begin(), this->end(),
-              __tmp.begin());
-    return __tmp;
-  }
-  device_vector()
-      : _storage(alloc_store(_min_capacity() * sizeof(T))), _size(0) {}
-  ~device_vector() = default;
-  explicit device_vector(size_type n) : device_vector(n, T()) {}
-  explicit device_vector(size_type n, const T &value)
-      : _storage(alloc_store(std::max(n, _min_capacity()) * sizeof(T))),
-        _size(n) {
-    auto buf = get_buffer();
-    std::fill(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(buf),
-              oneapi::dpl::begin(buf) + n, T(value));
-  }
-  device_vector(const device_vector &other)
-      : _storage(other._storage), _size(other.size()) {}
-  device_vector(device_vector &&other)
-      : _storage(std::move(other._storage)), _size(other.size()) {}
-
-  template <typename InputIterator>
-  device_vector(InputIterator first,
-                typename std::enable_if<
-                    internal::is_iterator<InputIterator>::value &&
-                        !std::is_pointer<InputIterator>::value &&
-                        std::is_same<typename std::iterator_traits<
-                                         InputIterator>::iterator_category,
-                                     std::random_access_iterator_tag>::value,
-                    InputIterator>::type last)
-      : _storage(alloc_store(std::distance(first, last) * sizeof(T))),
-        _size(std::distance(first, last)) {
-    auto buf = get_buffer();
-    auto dst = oneapi::dpl::begin(buf);
-    std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-              first, last, dst);
-  }
-
-  template <typename InputIterator>
-  device_vector(InputIterator first,
-                typename std::enable_if<std::is_pointer<InputIterator>::value,
-                                        InputIterator>::type last)
-      : _storage(alloc_store(std::distance(first, last) * sizeof(T))),
-        _size(std::distance(first, last)) {
-    auto buf = get_buffer();
-    Buffer tmp_buf(first, last);
-    auto start = oneapi::dpl::begin(tmp_buf);
-    auto end = oneapi::dpl::end(tmp_buf);
-    auto dst = oneapi::dpl::begin(buf);
-    std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-              start, end, dst);
-  }
-
-  template <typename InputIterator>
-  device_vector(InputIterator first,
-                typename std::enable_if<
-                    internal::is_iterator<InputIterator>::value &&
-                        !std::is_same<typename std::iterator_traits<
-                                          InputIterator>::iterator_category,
-                                      std::random_access_iterator_tag>::value,
-                    InputIterator>::type last)
-      : _storage(alloc_store(std::distance(first, last) * sizeof(T))),
-        _size(std::distance(first, last)) {
-    auto buf = get_buffer();
-    std::vector<T> tmp(first, last);
-    Buffer tmp_buf(tmp);
-    auto start = oneapi::dpl::begin(tmp_buf);
-    auto end = oneapi::dpl::end(tmp_buf);
-    auto dst = oneapi::dpl::begin(buf);
-    std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-              start, end, dst);
-  }
-
-  template <typename OtherAllocator>
-  device_vector(const device_vector<T, OtherAllocator> &v)
-      : _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) {
-    auto buf = get_buffer();
-    auto dst = oneapi::dpl::begin(buf);
-    std::copy(oneapi::dpl::execution::make_device_policy(get_default_queue()),
-              v.real_begin(), v.real_begin() + v.size(), dst);
-  }
-
-  template <typename OtherAllocator>
-  device_vector(std::vector<T, OtherAllocator> &v)
-      : _storage(alloc_store(v.size() * sizeof(T))), _size(v.size()) {
-    std::copy(oneapi::dpl::execution::dpcpp_default, v.begin(), v.end(),
-              oneapi::dpl::begin(get_buffer()));
-  }
-
-  device_vector &operator=(const device_vector &other) {
-    // Copy assignment operator:
-    _size = other.size();
-    void *tmp = alloc_store(_size * sizeof(T));
-    auto tmp_buf =
-        detail::mem_mgr::instance()
-            .translate_ptr(tmp)
-            .buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
-    std::copy(oneapi::dpl::execution::dpcpp_default,
-              oneapi::dpl::begin(other.get_buffer()),
-              oneapi::dpl::end(other.get_buffer()),
-              oneapi::dpl::begin(tmp_buf));
-    detail::mem_mgr::instance().mem_free(_storage);
-    _storage = tmp;
-    return *this;
-  }
-  device_vector &operator=(device_vector &&other) {
-    // Move assignment operator:
-    _size = other.size();
-    this->_storage = std::move(other._storage);
-    return *this;
-  }
-  template <typename OtherAllocator>
-  device_vector &operator=(const std::vector<T, OtherAllocator> &v) {
-    Buffer data(v.begin(), v.end());
-    _size = v.size();
-    void *tmp = alloc_store(_size * sizeof(T));
-    auto tmp_buf =
-        detail::mem_mgr::instance()
-            .translate_ptr(tmp)
-            .buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
-    std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(data),
-              oneapi::dpl::end(data), oneapi::dpl::begin(tmp_buf));
-    detail::mem_mgr::instance().mem_free(_storage);
-    _storage = tmp;
-
-    return *this;
-  }
-  Buffer get_buffer() const {
-    return detail::mem_mgr::instance()
-        .translate_ptr(_storage)
-        .buffer.template reinterpret<T, 1>(sycl::range<1>(capacity()));
-  }
-  size_type size() const { return _size; }
-  iterator begin() noexcept { return device_iterator<T>(get_buffer(), 0); }
-  iterator end() { return device_iterator<T>(get_buffer(), _size); }
-  const_iterator begin() const noexcept {
-    return device_iterator<T>(get_buffer(), 0);
-  }
-  const_iterator cbegin() const noexcept { return begin(); }
-  const_iterator end() const { return device_iterator<T>(get_buffer(), _size); }
-  const_iterator cend() const { return end(); }
-  T *real_begin() {
-    return (detail::mem_mgr::instance()
-                .translate_ptr(_storage)
-                .buffer.template get_access<sycl::access_mode::read_write>())
-        .get_pointer();
-  }
-  const T *real_begin() const {
-    return const_cast<device_vector *>(this)
-        ->detail::mem_mgr::instance()
-        .translate_ptr(_storage)
-        .buffer.template get_access<sycl::access_mode::read_write>()
-        .get_pointer();
-  }
-  void swap(device_vector &v) {
-    void *temp = v._storage;
-    v._storage = this->_storage;
-    this->_storage = temp;
-    std::swap(_size, v._size);
-  }
-  reference operator[](size_type n) { return *(begin() + n); }
-  const_reference operator[](size_type n) const { return *(begin() + n); }
-  void reserve(size_type n) {
-    if (n > capacity()) {
-      // create new buffer (allocate for new size)
-      void *a = alloc_store(n * sizeof(T));
-
-      // copy content (old buffer to new buffer)
-      if (_storage != nullptr) {
-        auto tmp = detail::mem_mgr::instance()
-                       .translate_ptr(a)
-                       .buffer.template reinterpret<T, 1>(sycl::range<1>(n));
-        auto src_buf = get_buffer();
-        std::copy(oneapi::dpl::execution::dpcpp_default,
-                  oneapi::dpl::begin(src_buf), oneapi::dpl::end(src_buf),
-                  oneapi::dpl::begin(tmp));
-
-        // deallocate old memory
-        detail::mem_mgr::instance().mem_free(_storage);
-      }
-      _storage = a;
-    }
-  }
-  void resize(size_type new_size, const T &x = T()) {
-    reserve(new_size);
-    if (_size < new_size) {
-      auto src_buf = get_buffer();
-      std::fill(oneapi::dpl::execution::dpcpp_default,
-                oneapi::dpl::begin(src_buf) + _size,
-                oneapi::dpl::begin(src_buf) + new_size, x);
-    }
-    _size = new_size;
-  }
-  size_type max_size(void) const {
-    return std::numeric_limits<size_type>::max() / sizeof(T);
-  }
-  size_type capacity() const {
-    return _storage != nullptr ? detail::mem_mgr::instance()
-                                         .translate_ptr(_storage)
-                                         .buffer.size() /
-                                     sizeof(T)
-                               : 0;
-  }
-  const_reference front() const { return *begin(); }
-  reference front() { return *begin(); }
-  const_reference back(void) const { return *(end() - 1); }
-  reference back(void) { return *(end() - 1); }
-  pointer data(void) { return reinterpret_cast<pointer>(_storage); }
-  const_pointer data(void) const {
-    return reinterpret_cast<const_pointer>(_storage);
-  }
-  void shrink_to_fit(void) {
-    if (_size != capacity()) {
-      void *a = alloc_store(_size * sizeof(T));
-      auto tmp = detail::mem_mgr::instance()
-                     .translate_ptr(a)
-                     .buffer.template reinterpret<T, 1>(sycl::range<1>(_size));
-      std::copy(oneapi::dpl::execution::dpcpp_default,
-                oneapi::dpl::begin(get_buffer()),
-                oneapi::dpl::begin(get_buffer()) + _size,
-                oneapi::dpl::begin(tmp));
-      detail::mem_mgr::instance().mem_free(_storage);
-      _storage = a;
-    }
-  }
-  void assign(size_type n, const T &x) {
-    resize(n);
-    std::fill(oneapi::dpl::execution::dpcpp_default, begin(), begin() + n, x);
-  }
-  template <typename InputIterator>
-  void
-  assign(InputIterator first,
-         typename std::enable_if<internal::is_iterator<InputIterator>::value,
-                                 InputIterator>::type last) {
-    auto n = std::distance(first, last);
-    resize(n);
-    if (internal::is_iterator<InputIterator>::value &&
-        !std::is_pointer<InputIterator>::value)
-      std::copy(oneapi::dpl::execution::dpcpp_default, first, last, begin());
-    else {
-      Buffer tmp(first, last);
-      std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
-                oneapi::dpl::end(tmp), begin());
-    }
-  }
-  void clear(void) {
-    _size = 0;
-    detail::mem_mgr::instance().mem_free(_storage);
-    _storage = nullptr;
-  }
-  bool empty(void) const { return (size() == 0); }
-  void push_back(const T &x) { insert(end(), size_type(1), x); }
-  void pop_back(void) {
-    if (_size > 0)
-      --_size;
-  }
-  iterator erase(iterator first, iterator last) {
-    auto n = std::distance(first, last);
-    if (last == end()) {
-      _size = _size - n;
-      return end();
-    }
-    Buffer tmp{Range(std::distance(last, end()))};
-    // copy remainder to temporary buffer.
-    std::copy(oneapi::dpl::execution::dpcpp_default, last, end(),
-              oneapi::dpl::begin(tmp));
-    // override (erase) subsequence in storage.
-    std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
-              oneapi::dpl::end(tmp), first);
-    resize(_size - n);
-    return begin() + first.get_idx() + n;
-  }
-  iterator erase(iterator pos) { return erase(pos, pos + 1); }
-  iterator insert(iterator position, const T &x) {
-    auto n = std::distance(begin(), position);
-    insert(position, size_type(1), x);
-    return begin() + n;
-  }
-  void insert(iterator position, size_type n, const T &x) {
-    if (position == end()) {
-      resize(size() + n);
-      std::fill(oneapi::dpl::execution::dpcpp_default, end() - n, end(), x);
-    } else {
-      auto i_n = std::distance(begin(), position);
-      // allocate temporary storage
-      Buffer tmp{Range(std::distance(position, end()))};
-      // copy remainder
-      std::copy(oneapi::dpl::execution::dpcpp_default, position, end(),
-                oneapi::dpl::begin(tmp));
-
-      resize(size() + n);
-      // resizing might invalidate position
-      position = begin() + position.get_idx();
-
-      std::fill(oneapi::dpl::execution::dpcpp_default, position, position + n,
-                x);
-
-      std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
-                oneapi::dpl::end(tmp), position + n);
-    }
-  }
-  template <typename InputIterator>
-  void
-  insert(iterator position, InputIterator first,
-         typename std::enable_if<internal::is_iterator<InputIterator>::value,
-                                 InputIterator>::type last) {
-    auto n = std::distance(first, last);
-    if (position == end()) {
-      resize(size() + n);
-      std::copy(oneapi::dpl::execution::dpcpp_default, first, last, end());
-    } else {
-      Buffer tmp{Range(std::distance(position, end()))};
-
-      std::copy(oneapi::dpl::execution::dpcpp_default, position, end(),
-                oneapi::dpl::begin(tmp));
-
-      resize(size() + n);
-      // resizing might invalidate position
-      position = begin() + position.get_idx();
-
-      std::copy(oneapi::dpl::execution::dpcpp_default, first, last, position);
-      std::copy(oneapi::dpl::execution::dpcpp_default, oneapi::dpl::begin(tmp),
-                oneapi::dpl::end(tmp), position + n);
-    }
-  }
-};
-
-#endif
-
-} // end namespace dpct
-
-#endif
diff --git a/dpct/dpl_utils.hpp b/dpct/dpl_utils.hpp
deleted file mode 100644
index 79a6e74048f33..0000000000000
--- a/dpct/dpl_utils.hpp
+++ /dev/null
@@ -1,26 +0,0 @@
-//==---- dpl_utils.hpp ----------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_DPL_UTILS_HPP__
-#define __DPCT_DPL_UTILS_HPP__
-
-#define ONEDPL_USE_DPCPP_BACKEND 1
-#define __USE_DPCT 1
-
-#include <oneapi/dpl/execution>
-#include <oneapi/dpl/algorithm>
-#include <oneapi/dpl/numeric>
-
-#include "dpl_extras/memory.h"
-#include "dpl_extras/algorithm.h"
-#include "dpl_extras/numeric.h"
-#include "dpl_extras/iterators.h"
-#include "dpl_extras/vector.h"
-#include "dpl_extras/dpcpp_extensions.h"
-
-#endif // __DPCT_DPL_UTILS_HPP__
diff --git a/dpct/fft_utils.hpp b/dpct/fft_utils.hpp
deleted file mode 100644
index cba1b253cecaf..0000000000000
--- a/dpct/fft_utils.hpp
+++ /dev/null
@@ -1,1376 +0,0 @@
-//==---- fft_utils.hpp ----------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_FFT_UTILS_HPP__
-#define __DPCT_FFT_UTILS_HPP__
-
-#include <sycl/sycl.hpp>
-#include <oneapi/mkl.hpp>
-#include <optional>
-#include <utility>
-#include "lib_common_utils.hpp"
-
-namespace dpct {
-namespace fft {
-/// An enumeration type to describe the FFT direction is forward or backward.
-enum fft_direction : int {
-  forward = 0,
-  backward
-};
-/// An enumeration type to describe the types of FFT input and output data.
-enum fft_type : int {
-  real_float_to_complex_float = 0,
-  complex_float_to_real_float,
-  real_double_to_complex_double,
-  complex_double_to_real_double,
-  complex_float_to_complex_float,
-  complex_double_to_complex_double,
-};
-
-/// A class to perform FFT calculation.
-class fft_engine {
-public:
-  /// Default constructor.
-  fft_engine() {}
-  /// Commit the configuration to calculate n-D FFT.
-  /// \param [in] exec_queue The queue where the calculation should be executed.
-  /// \param [in] dim Dimension number of the data.
-  /// \param [in] n Pointer to an array containing each dimension's size.
-  /// \param [in] inembed Pointer to an array containing each dimension's size
-  /// of the embedded input data.
-  /// \param [in] istride Stride size of the input data.
-  /// \param [in] idist Distance between the two batches of the input data.
-  /// \param [in] input_type Input data type.
-  /// \param [in] onembed Pointer to an array containing each dimension's size
-  /// of the embedded output data.
-  /// \param [in] ostride Stride size of the output data.
-  /// \param [in] odist Distance between the two batches of the output data.
-  /// \param [in] output_type Output data type.
-  /// \param [in] batch The number of FFT operations to perform.
-  /// \param [out] scratchpad_size The workspace size required for this FFT.
-  /// If this value is used to allocate memory, \p direction_and_placement need
-  /// to be specified explicitly to get correct result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If this value is specified, the direction parameter
-  /// will be ignored in the fft_engine::compute function. If it is not set,
-  /// forward direction(if current FFT is complex-to-complex) and out-of-place
-  /// (false) are set by default.
-  void commit(sycl::queue *exec_queue, int dim, long long *n,
-              long long *inembed, long long istride, long long idist,
-              library_data_t input_type, long long *onembed, long long ostride,
-              long long odist, library_data_t output_type, long long batch,
-              size_t *scratchpad_size,
-              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                  direction_and_placement = std::nullopt) {
-    _q = exec_queue;
-    init<long long>(dim, n, inembed, istride, idist, input_type, onembed,
-                    ostride, odist, output_type, batch,
-                    direction_and_placement);
-    if (scratchpad_size) {
-      if (_is_estimate_call)
-        *scratchpad_size = _workspace_estimate_bytes;
-      else
-        *scratchpad_size = _workspace_bytes;
-    }
-  }
-  /// Commit the configuration to calculate n-D FFT.
-  /// \param [in] exec_queue The queue where the calculation should be executed.
-  /// \param [in] dim Dimension number of the data.
-  /// \param [in] n Pointer to an array containing each dimension's size.
-  /// \param [in] inembed Pointer to an array containing each dimension's size
-  /// of the embedded input data.
-  /// \param [in] istride Stride size of the input data.
-  /// \param [in] idist Distance between the two batches of the input data.
-  /// \param [in] input_type Input data type.
-  /// \param [in] onembed Pointer to an array containing each dimension's size
-  /// of the embedded output data.
-  /// \param [in] ostride Stride size of the output data.
-  /// \param [in] odist Distance between the two batches of the output data.
-  /// \param [in] output_type Output data type.
-  /// \param [in] batch The number of FFT operations to perform.
-  /// \param [out] scratchpad_size The workspace size required for this FFT.
-  /// If this value is used to allocate memory, \p direction_and_placement need
-  /// to be specified explicitly to get correct result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If this value is specified, the direction parameter
-  /// will be ignored in the fft_engine::compute function. If it is not set,
-  /// forward direction(if current FFT is complex-to-complex) and out-of-place
-  /// (false) are set by default.
-  void commit(sycl::queue *exec_queue, int dim, int *n, int *inembed,
-              int istride, int idist, library_data_t input_type, int *onembed,
-              int ostride, int odist, library_data_t output_type, int batch,
-              size_t *scratchpad_size,
-              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                  direction_and_placement = std::nullopt) {
-    _q = exec_queue;
-    init<int>(dim, n, inembed, istride, idist, input_type, onembed, ostride,
-              odist, output_type, batch, direction_and_placement);
-    if (scratchpad_size) {
-      if (_is_estimate_call)
-        *scratchpad_size = _workspace_estimate_bytes;
-      else
-        *scratchpad_size = _workspace_bytes;
-    }
-  }
-  /// Commit the configuration to calculate n-D FFT.
-  /// \param [in] exec_queue The queue where the calculation should be executed.
-  /// \param [in] dim Dimension number of the data.
-  /// \param [in] n Pointer to an array containing each dimension's size.
-  /// \param [in] inembed Pointer to an array containing each dimension's size
-  /// of the embedded input data.
-  /// \param [in] istride Stride size of the input data.
-  /// \param [in] idist Distance between the two batches of the input data.
-  /// \param [in] onembed Pointer to an array containing each dimension's size
-  /// of the embedded output data.
-  /// \param [in] ostride Stride size of the output data.
-  /// \param [in] odist Distance between the two batches of the output data.
-  /// \param [in] type The FFT type.
-  /// \param [in] batch The number of FFT operations to perform.
-  /// \param [out] scratchpad_size The workspace size required for this FFT.
-  /// If this value is used to allocate memory, \p direction_and_placement need
-  /// to be specified explicitly to get correct result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If this value is specified, the direction parameter
-  /// will be ignored in the fft_engine::compute function. If it is not set,
-  /// forward direction(if current FFT is complex-to-complex) and out-of-place
-  /// (false) are set by default.
-  void commit(sycl::queue *exec_queue, int dim, long long *n,
-              long long *inembed, long long istride, long long idist,
-              long long *onembed, long long ostride, long long odist,
-              fft_type type, long long batch, size_t *scratchpad_size,
-              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                  direction_and_placement = std::nullopt) {
-    commit(exec_queue, dim, n, inembed, istride, idist,
-           fft_type_to_data_type(type).first, onembed, ostride, odist,
-           fft_type_to_data_type(type).second, batch, scratchpad_size,
-           direction_and_placement);
-  }
-  /// Commit the configuration to calculate n-D FFT.
-  /// \param [in] exec_queue The queue where the calculation should be executed.
-  /// \param [in] dim Dimension number of the data.
-  /// \param [in] n Pointer to an array containing each dimension's size.
-  /// \param [in] inembed Pointer to an array containing each dimension's size
-  /// of the embedded input data.
-  /// \param [in] istride Stride size of the input data.
-  /// \param [in] idist Distance between the two batches of the input data.
-  /// \param [in] onembed Pointer to an array containing each dimension's size
-  /// of the embedded output data.
-  /// \param [in] ostride Stride size of the output data.
-  /// \param [in] odist Distance between the two batches of the output data.
-  /// \param [in] type The FFT type.
-  /// \param [in] batch The number of FFT operations to perform.
-  /// \param [out] scratchpad_size The workspace size required for this FFT.
-  /// If this value is used to allocate memory, \p direction_and_placement need
-  /// to be specified explicitly to get correct result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If this value is specified, the direction parameter
-  /// will be ignored in the fft_engine::compute function. If it is not set,
-  /// forward direction(if current FFT is complex-to-complex) and out-of-place
-  /// (false) are set by default.
-  void commit(sycl::queue *exec_queue, int dim, int *n, int *inembed,
-              int istride, int idist, int *onembed, int ostride, int odist,
-              fft_type type, int batch, size_t *scratchpad_size,
-              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                  direction_and_placement = std::nullopt) {
-    commit(exec_queue, dim, n, inembed, istride, idist,
-           fft_type_to_data_type(type).first, onembed, ostride, odist,
-           fft_type_to_data_type(type).second, batch, scratchpad_size,
-           direction_and_placement);
-  }
-  /// Commit the configuration to calculate 1-D FFT.
-  /// \param [in] exec_queue The queue where the calculation should be executed.
-  /// \param [in] n1 The size of the dimension of the data.
-  /// \param [in] type The FFT type.
-  /// \param [in] batch The number of FFT operations to perform.
-  /// \param [out] scratchpad_size The workspace size required for this FFT.
-  /// If this value is used to allocate memory, \p direction_and_placement need
-  /// to be specified explicitly to get correct result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If this value is specified, the direction parameter
-  /// will be ignored in the fft_engine::compute function. If it is not set,
-  /// forward direction(if current FFT is complex-to-complex) and out-of-place
-  /// (false) are set by default.
-  void commit(sycl::queue *exec_queue, int n1, fft_type type, int batch,
-              size_t *scratchpad_size,
-              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                  direction_and_placement = std::nullopt) {
-    _q = exec_queue;
-    _n.resize(1);
-    _n[0] = n1;
-    std::tie(_input_type, _output_type) = fft_type_to_data_type(type);
-    _dim = 1;
-    _batch = batch;
-    _is_basic = true;
-    if (direction_and_placement.has_value()) {
-      _is_user_specified_dir_and_placement = true;
-      _direction = direction_and_placement->first;
-      _is_inplace = direction_and_placement->second;
-    }
-    config_and_commit_basic();
-    if (scratchpad_size) {
-      if (_is_estimate_call)
-        *scratchpad_size = _workspace_estimate_bytes;
-      else
-        *scratchpad_size = _workspace_bytes;
-    }
-  }
-  /// Commit the configuration to calculate 2-D FFT.
-  /// \param [in] exec_queue The queue where the calculation should be executed.
-  /// \param [in] n2 The size of the 2nd dimension (outermost) of the data.
-  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
-  /// \param [in] type The FFT type.
-  /// \param [out] scratchpad_size The workspace size required for this FFT.
-  /// If this value is used to allocate memory, \p direction_and_placement need
-  /// to be specified explicitly to get correct result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If this value is specified, the direction parameter
-  /// will be ignored in the fft_engine::compute function. If it is not set,
-  /// forward direction(if current FFT is complex-to-complex) and out-of-place
-  /// (false) are set by default.
-  void commit(sycl::queue *exec_queue, int n2, int n1, fft_type type,
-              size_t *scratchpad_size,
-              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                  direction_and_placement = std::nullopt) {
-    _q = exec_queue;
-    _n.resize(2);
-    _n[0] = n2;
-    _n[1] = n1;
-    std::tie(_input_type, _output_type) = fft_type_to_data_type(type);
-    _dim = 2;
-    _is_basic = true;
-    if (direction_and_placement.has_value()) {
-      _is_user_specified_dir_and_placement = true;
-      _direction = direction_and_placement->first;
-      _is_inplace = direction_and_placement->second;
-    }
-    config_and_commit_basic();
-    if (scratchpad_size) {
-      if (_is_estimate_call)
-        *scratchpad_size = _workspace_estimate_bytes;
-      else
-        *scratchpad_size = _workspace_bytes;
-    }
-  }
-  /// Commit the configuration to calculate 3-D FFT.
-  /// \param [in] exec_queue The queue where the calculation should be executed.
-  /// \param [in] n3 The size of the 3rd dimension (outermost) of the data.
-  /// \param [in] n2 The size of the 2nd dimension of the data.
-  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
-  /// \param [in] type The FFT type.
-  /// \param [out] scratchpad_size The workspace size required for this FFT.
-  /// If this value is used to allocate memory, \p direction_and_placement need
-  /// to be specified explicitly to get correct result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If this value is specified, the direction parameter
-  /// will be ignored in the fft_engine::compute function. If it is not set,
-  /// forward direction(if current FFT is complex-to-complex) and out-of-place
-  /// (false) are set by default.
-  void commit(sycl::queue *exec_queue, int n3, int n2, int n1, fft_type type,
-              size_t *scratchpad_size,
-              std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                  direction_and_placement = std::nullopt) {
-    _q = exec_queue;
-    _n.resize(3);
-    _n[0] = n3;
-    _n[1] = n2;
-    _n[2] = n1;
-    std::tie(_input_type, _output_type) = fft_type_to_data_type(type);
-    _dim = 3;
-    _is_basic = true;
-    if (direction_and_placement.has_value()) {
-      _is_user_specified_dir_and_placement = true;
-      _direction = direction_and_placement->first;
-      _is_inplace = direction_and_placement->second;
-    }
-    config_and_commit_basic();
-    if (scratchpad_size) {
-      if (_is_estimate_call)
-        *scratchpad_size = _workspace_estimate_bytes;
-      else
-        *scratchpad_size = _workspace_bytes;
-    }
-  }
-
-  /// Create the class for calculate 1-D FFT.
-  /// \param [in] exec_queue The queue where the calculation should be executed.
-  /// \param [in] n1 The size of the dimension of the data.
-  /// \param [in] type The FFT type.
-  /// \param [in] batch The number of FFT operations to perform.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If this value is specified, the direction parameter
-  /// will be ignored in the fft_engine::compute function. If it is not set,
-  /// forward direction(if current FFT is complex-to-complex) and out-of-place
-  /// (false) are set by default.
-  static fft_engine *
-  create(sycl::queue *exec_queue, int n1, fft_type type, int batch,
-         std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-             direction_and_placement = std::nullopt) {
-    fft_engine *engine = new fft_engine();
-    engine->commit(exec_queue, n1, type, batch, nullptr,
-                   direction_and_placement);
-    return engine;
-  }
-  /// Create the class for calculate 2-D FFT.
-  /// \param [in] exec_queue The queue where the calculation should be executed.
-  /// \param [in] n2 The size of the 2nd dimension (outermost) of the data.
-  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
-  /// \param [in] type The FFT type.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If this value is specified, the direction parameter
-  /// will be ignored in the fft_engine::compute function. If it is not set,
-  /// forward direction(if current FFT is complex-to-complex) and out-of-place
-  /// (false) are set by default.
-  static fft_engine *
-  create(sycl::queue *exec_queue, int n2, int n1, fft_type type,
-         std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-             direction_and_placement = std::nullopt) {
-    fft_engine *engine = new fft_engine();
-    engine->commit(exec_queue, n2, n1, type, nullptr, direction_and_placement);
-    return engine;
-  }
-  /// Create the class for calculate 3-D FFT.
-  /// \param [in] exec_queue The queue where the calculation should be executed.
-  /// \param [in] n3 The size of the 3rd dimension (outermost) of the data.
-  /// \param [in] n2 The size of the 2nd dimension of the data.
-  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
-  /// \param [in] type The FFT type.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If this value is specified, the direction parameter
-  /// will be ignored in the fft_engine::compute function. If it is not set,
-  /// forward direction(if current FFT is complex-to-complex) and out-of-place
-  /// (false) are set by default.
-  static fft_engine *
-  create(sycl::queue *exec_queue, int n3, int n2, int n1, fft_type type,
-         std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-             direction_and_placement = std::nullopt) {
-    fft_engine *engine = new fft_engine();
-    engine->commit(exec_queue, n3, n2, n1, type, nullptr,
-                   direction_and_placement);
-    return engine;
-  }
-  /// Create the class for calculate n-D FFT.
-  /// \param [in] exec_queue The queue where the calculation should be executed.
-  /// \param [in] dim Dimension number of the data.
-  /// \param [in] n Pointer to an array containing each dimension's size.
-  /// \param [in] inembed Pointer to an array containing each dimension's size
-  /// of the embedded input data.
-  /// \param [in] istride Stride size of the input data.
-  /// \param [in] idist Distance between the two batches of the input data.
-  /// \param [in] onembed Pointer to an array containing each dimension's size
-  /// of the embedded output data.
-  /// \param [in] ostride Stride size of the output data.
-  /// \param [in] odist Distance between the two batches of the output data.
-  /// \param [in] type The FFT type.
-  /// \param [in] batch The number of FFT operations to perform.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If this value is specified, the direction parameter
-  /// will be ignored in the fft_engine::compute function. If it is not set,
-  /// forward direction(if current FFT is complex-to-complex) and out-of-place
-  /// (false) are set by default.
-  static fft_engine *
-  create(sycl::queue *exec_queue, int dim, int *n, int *inembed, int istride,
-         int idist, int *onembed, int ostride, int odist, fft_type type,
-         int batch,
-         std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-             direction_and_placement = std::nullopt) {
-    fft_engine *engine = new fft_engine();
-    engine->commit(exec_queue, dim, n, inembed, istride, idist, onembed,
-                   ostride, odist, type, batch, nullptr,
-                   direction_and_placement);
-    return engine;
-  }
-  /// Create the class for calculate FFT without commit any config.
-  static fft_engine *create() {
-    fft_engine *engine = new fft_engine();
-    return engine;
-  }
-  /// Destroy the class for calculate FFT.
-  /// \param [in] engine Pointer returned from fft_engine::craete.
-  static void destroy(fft_engine *engine) { delete engine; }
-
-#ifdef __INTEL_MKL__
-  /// Estimates the workspace size for calculating n-D FFT.
-  /// \param [in] dim Dimension number of the data.
-  /// \param [in] n Pointer to an array containing each dimension's size.
-  /// \param [in] inembed Pointer to an array containing each dimension's size
-  /// of the embedded input data.
-  /// \param [in] istride Stride size of the input data.
-  /// \param [in] idist Distance between the two batches of the input data.
-  /// \param [in] onembed Pointer to an array containing each dimension's size
-  /// of the embedded output data.
-  /// \param [in] ostride Stride size of the output data.
-  /// \param [in] odist Distance between the two batches of the output data.
-  /// \param [in] type The FFT type.
-  /// \param [in] batch The number of FFT operations to perform.
-  /// \param [out] estimated_scratchpad_size The estimated workspace size
-  /// required for this FFT. If this value is used to allocate memory,
-  /// \p direction_and_placement need to be specified explicitly to get correct
-  /// result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT
-  /// direction and placement info. If it is not set, forward direction(if
-  /// current FFT is complex-to-complex) and out-of-place (false) are set by default.
-  static void
-  estimate_size(int dim, long long *n, long long *inembed, long long istride,
-                long long idist, long long *onembed, long long ostride,
-                long long odist, fft_type type, long long batch,
-                size_t *estimated_scratchpad_size,
-                std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                    direction_and_placement = std::nullopt) {
-    fft_engine *engine = fft_engine::create();
-    engine->_is_estimate_call = true;
-    engine->commit(&dpct::get_default_queue(), dim, n, inembed, istride, idist,
-                   fft_type_to_data_type(type).first, onembed, ostride, odist,
-                   fft_type_to_data_type(type).second, batch,
-                   estimated_scratchpad_size, direction_and_placement);
-    fft_engine::destroy(engine);
-  }
-  /// Estimates the workspace size for calculating n-D FFT.
-  /// \param [in] dim Dimension number of the data.
-  /// \param [in] n Pointer to an array containing each dimension's size.
-  /// \param [in] inembed Pointer to an array containing each dimension's size
-  /// of the embedded input data.
-  /// \param [in] istride Stride size of the input data.
-  /// \param [in] idist Distance between the two batches of the input data.
-  /// \param [in] onembed Pointer to an array containing each dimension's size
-  /// of the embedded output data.
-  /// \param [in] ostride Stride size of the output data.
-  /// \param [in] odist Distance between the two batches of the output data.
-  /// \param [in] type The FFT type.
-  /// \param [in] batch The number of FFT operations to perform.
-  /// \param [out] estimated_scratchpad_size The estimated workspace size
-  /// required for this FFT. If this value is used to allocate memory,
-  /// \p direction_and_placement need to be specified explicitly to get correct
-  /// result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT
-  /// direction and placement info. If it is not set, forward direction(if
-  /// current FFT is complex-to-complex) and out-of-place (false) are set by default.
-  static void
-  estimate_size(int dim, int *n, int *inembed, int istride, int idist,
-                int *onembed, int ostride, int odist, fft_type type, int batch,
-                size_t *estimated_scratchpad_size,
-                std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                    direction_and_placement = std::nullopt) {
-    fft_engine *engine = fft_engine::create();
-    engine->_is_estimate_call = true;
-    engine->commit(&dpct::get_default_queue(), dim, n, inembed, istride, idist,
-                   fft_type_to_data_type(type).first, onembed, ostride, odist,
-                   fft_type_to_data_type(type).second, batch,
-                   estimated_scratchpad_size, direction_and_placement);
-    fft_engine::destroy(engine);
-  }
-  /// Estimates the workspace size for calculating 1-D FFT.
-  /// \param [in] n1 The size of the dimension of the data.
-  /// \param [in] type The FFT type.
-  /// \param [in] batch The number of FFT operations to perform.
-  /// \param [out] estimated_scratchpad_size The estimated workspace size
-  /// required for this FFT. If this value is used to allocate memory,
-  /// \p direction_and_placement need to be specified explicitly to get correct
-  /// result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT direction
-  /// and placement info. If it is not set, forward direction(if current FFT is
-  /// complex-to-complex) and out-of-place (false) are set by default.
-  static void
-  estimate_size(int n1, fft_type type, int batch,
-                size_t *estimated_scratchpad_size,
-                std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                    direction_and_placement = std::nullopt) {
-    fft_engine *engine = fft_engine::create();
-    engine->_is_estimate_call = true;
-    engine->commit(&dpct::get_default_queue(), n1, type, batch,
-                   estimated_scratchpad_size, direction_and_placement);
-    fft_engine::destroy(engine);
-  }
-  /// Estimates the workspace size for calculating 2-D FFT.
-  /// \param [in] n2 The size of the 2nd dimension (outermost) of the data.
-  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
-  /// \param [in] type The FFT type.
-  /// \param [out] estimated_scratchpad_size The estimated workspace size
-  /// required for this FFT. If this value is used to allocate memory,
-  /// \p direction_and_placement need to be specified explicitly to get correct
-  /// result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT
-  /// direction and placement info. If it is not set, forward direction(if
-  /// current FFT is complex-to-complex) and out-of-place (false) are set by default.
-  static void
-  estimate_size(int n2, int n1, fft_type type,
-                size_t *estimated_scratchpad_size,
-                std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                    direction_and_placement = std::nullopt) {
-    fft_engine *engine = fft_engine::create();
-    engine->_is_estimate_call = true;
-    engine->commit(&dpct::get_default_queue(), n2, n1, type,
-                   estimated_scratchpad_size, direction_and_placement);
-    fft_engine::destroy(engine);
-  }
-  /// Estimates the workspace size for calculating 3-D FFT.
-  /// \param [in] n3 The size of the 3rd dimension (outermost) of the data.
-  /// \param [in] n2 The size of the 2nd dimension of the data.
-  /// \param [in] n1 The size of the 1st dimension (innermost) of the data.
-  /// \param [in] type The FFT type.
-  /// \param [out] estimated_scratchpad_size The estimated workspace size
-  /// required for this FFT. If this value is used to allocate memory,
-  /// \p direction_and_placement need to be specified explicitly to get correct
-  /// result.
-  /// \param [in] direction_and_placement Explicitly specify the FFT
-  /// direction and placement info. If it is not set, forward direction(if
-  /// current FFT is complex-to-complex) and out-of-place (false) are set by default.
-  static void
-  estimate_size(int n3, int n2, int n1, fft_type type,
-                size_t *estimated_scratchpad_size,
-                std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                    direction_and_placement = std::nullopt) {
-    fft_engine *engine = fft_engine::create();
-    engine->_is_estimate_call = true;
-    engine->commit(&dpct::get_default_queue(), n3, n2, n1, type,
-                   estimated_scratchpad_size, direction_and_placement);
-    fft_engine::destroy(engine);
-  }
-#endif
-
-  /// Execute the FFT calculation.
-  /// \param [in] input Pointer to the input data.
-  /// \param [out] output Pointer to the output data.
-  /// \param [in] direction The FFT direction.
-  template <typename input_t, typename output_t>
-  void compute(input_t *input, output_t *output, fft_direction direction) {
-    if (_input_type == library_data_t::complex_float &&
-        _output_type == library_data_t::complex_float) {
-      compute_complex<float, oneapi::mkl::dft::precision::SINGLE>(
-          (float *)input, (float *)output, direction);
-    } else if (_input_type == library_data_t::complex_double &&
-               _output_type == library_data_t::complex_double) {
-      compute_complex<double, oneapi::mkl::dft::precision::DOUBLE>(
-          (double *)input, (double *)output, direction);
-    } else if (_input_type == library_data_t::real_float &&
-               _output_type == library_data_t::complex_float) {
-      _direction = direction;
-      compute_real<float, oneapi::mkl::dft::precision::SINGLE>((float *)input,
-                                                               (float *)output);
-    } else if (_input_type == library_data_t::complex_float &&
-               _output_type == library_data_t::real_float) {
-      _direction = direction;
-      compute_real<float, oneapi::mkl::dft::precision::SINGLE>((float *)input,
-                                                               (float *)output);
-    } else if (_input_type == library_data_t::real_double &&
-               _output_type == library_data_t::complex_double) {
-      _direction = direction;
-      compute_real<double, oneapi::mkl::dft::precision::DOUBLE>(
-          (double *)input, (double *)output);
-    } else if (_input_type == library_data_t::complex_double &&
-               _output_type == library_data_t::real_double) {
-      _direction = direction;
-      compute_real<double, oneapi::mkl::dft::precision::DOUBLE>(
-          (double *)input, (double *)output);
-    }
-  }
-  template <>
-  void compute(float *input, sycl::float2 *output, fft_direction direction) {
-    _direction = direction;
-    compute_real<float, oneapi::mkl::dft::precision::SINGLE>((float *)input,
-                                                             (float *)output);
-  }
-  template <>
-  void compute(sycl::float2 *input, float *output, fft_direction direction) {
-    _direction = direction;
-    compute_real<float, oneapi::mkl::dft::precision::SINGLE>((float *)input,
-                                                             (float *)output);
-  }
-  template <>
-  void compute(double *input, sycl::double2 *output, fft_direction direction) {
-    _direction = direction;
-    compute_real<double, oneapi::mkl::dft::precision::DOUBLE>((double *)input,
-                                                              (double *)output);
-  }
-  template <>
-  void compute(sycl::double2 *input, double *output, fft_direction direction) {
-    _direction = direction;
-    compute_real<double, oneapi::mkl::dft::precision::DOUBLE>((double *)input,
-                                                              (double *)output);
-  }
-  template <>
-  void compute(sycl::float2 *input, sycl::float2 *output,
-               fft_direction direction) {
-    compute_complex<float, oneapi::mkl::dft::precision::SINGLE>(
-        (float *)input, (float *)output, direction);
-  }
-  template <>
-  void compute(sycl::double2 *input, sycl::double2 *output,
-               fft_direction direction) {
-    compute_complex<double, oneapi::mkl::dft::precision::DOUBLE>(
-        (double *)input, (double *)output, direction);
-  }
-  /// Setting the user's SYCL queue for calculation.
-  /// \param [in] q Pointer to the SYCL queue.
-  void set_queue(sycl::queue *q) { _q = q; }
-#ifdef __INTEL_MKL__
-  /// Setting whether to use external or internal workspace.
-  /// \param [in] flag True means using internal workspace. False means using
-  /// external workspace.
-  void use_internal_workspace(bool flag = true) {
-    _use_external_workspace = !flag;
-  }
-  /// Specify the external workspace.
-  /// \param [in] ptr Pointer to the workspace.
-  void set_workspace(void *ptr) {
-    if (!_use_external_workspace) {
-      return;
-    }
-    if (_input_type == library_data_t::complex_float &&
-        _output_type == library_data_t::complex_float) {
-      if (_q->get_device().is_gpu()) {
-        auto data = dpct::detail::get_memory<float>(ptr);
-        _desc_sc->set_workspace(data);
-      }
-    } else if (_input_type == library_data_t::complex_double &&
-               _output_type == library_data_t::complex_double) {
-      if (_q->get_device().is_gpu()) {
-        auto data = dpct::detail::get_memory<double>(ptr);
-        _desc_dc->set_workspace(data);
-      }
-    } else if ((_input_type == library_data_t::real_float &&
-                _output_type == library_data_t::complex_float) ||
-               (_input_type == library_data_t::complex_float &&
-                _output_type == library_data_t::real_float)) {
-      if (_q->get_device().is_gpu()) {
-        auto data = dpct::detail::get_memory<float>(ptr);
-        _desc_sr->set_workspace(data);
-      }
-    } else if ((_input_type == library_data_t::real_double &&
-                _output_type == library_data_t::complex_double) ||
-               (_input_type == library_data_t::complex_double &&
-                _output_type == library_data_t::real_double)) {
-      if (_q->get_device().is_gpu()) {
-        auto data = dpct::detail::get_memory<double>(ptr);
-        _desc_dr->set_workspace(data);
-      }
-    } else {
-      throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
-                            "invalid fft type");
-    }
-  }
-#endif
-  /// Get the workspace size.
-  /// \param [out] scratchpad_size Workspace size in bytes.
-  void get_workspace_size(size_t *scratchpad_size) {
-    if (scratchpad_size) {
-      *scratchpad_size = _workspace_bytes;
-    }
-  }
-
-private:
-  static std::pair<library_data_t, library_data_t>
-  fft_type_to_data_type(fft_type type) {
-    switch (type) {
-    case fft_type::real_float_to_complex_float: {
-      return std::make_pair(library_data_t::real_float,
-                            library_data_t::complex_float);
-    }
-    case fft_type::complex_float_to_real_float: {
-      return std::make_pair(library_data_t::complex_float,
-                            library_data_t::real_float);
-    }
-    case fft_type::real_double_to_complex_double: {
-      return std::make_pair(library_data_t::real_double,
-                            library_data_t::complex_double);
-    }
-    case fft_type::complex_double_to_real_double: {
-      return std::make_pair(library_data_t::complex_double,
-                            library_data_t::real_double);
-    }
-    case fft_type::complex_float_to_complex_float: {
-      return std::make_pair(library_data_t::complex_float,
-                            library_data_t::complex_float);
-    }
-    case fft_type::complex_double_to_complex_double: {
-      return std::make_pair(library_data_t::complex_double,
-                            library_data_t::complex_double);
-    }
-    }
-  }
-
-  void config_and_commit_basic() {
-    if (_input_type == library_data_t::complex_float &&
-        _output_type == library_data_t::complex_float) {
-      _desc_sc = std::make_shared<
-          oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE,
-                                       oneapi::mkl::dft::domain::COMPLEX>>(_n);
-      std::int64_t distance = 1;
-      for (auto i : _n)
-        distance = distance * i;
-      _fwd_dist = distance;
-      _bwd_dist = distance;
-      _desc_sc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,
-                          distance);
-      _desc_sc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,
-                          distance);
-      _desc_sc->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                          _batch);
-#ifdef __INTEL_MKL__
-      if (_is_user_specified_dir_and_placement && _is_inplace)
-        _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            DFTI_CONFIG_VALUE::DFTI_INPLACE);
-      else
-        _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
-      if (_use_external_workspace) {
-        if (_q->get_device().is_gpu()) {
-          _desc_sc->set_value(
-              oneapi::mkl::dft::config_param::WORKSPACE,
-              oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
-        }
-      }
-      if (_is_estimate_call) {
-        if (_q->get_device().is_gpu()) {
-          _desc_sc->get_value(
-              oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES,
-              &_workspace_estimate_bytes);
-        }
-      } else {
-        _desc_sc->commit(*_q);
-        if (_q->get_device().is_gpu()) {
-          _desc_sc->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
-                              &_workspace_bytes);
-        }
-      }
-#else
-      if (_is_user_specified_dir_and_placement && _is_inplace)
-        _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            oneapi::mkl::dft::config_value::INPLACE);
-      else
-        _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            oneapi::mkl::dft::config_value::NOT_INPLACE);
-      _desc_sc->commit(*_q);
-#endif
-    } else if (_input_type == library_data_t::complex_double &&
-               _output_type == library_data_t::complex_double) {
-      _desc_dc = std::make_shared<
-          oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::DOUBLE,
-                                       oneapi::mkl::dft::domain::COMPLEX>>(_n);
-      std::int64_t distance = 1;
-      for (auto i : _n)
-        distance = distance * i;
-      _fwd_dist = distance;
-      _bwd_dist = distance;
-      _desc_dc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,
-                          distance);
-      _desc_dc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,
-                          distance);
-      _desc_dc->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                          _batch);
-#ifdef __INTEL_MKL__
-      if (_is_user_specified_dir_and_placement && _is_inplace)
-        _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            DFTI_CONFIG_VALUE::DFTI_INPLACE);
-      else
-        _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
-      if (_use_external_workspace) {
-        if (_q->get_device().is_gpu()) {
-          _desc_dc->set_value(
-              oneapi::mkl::dft::config_param::WORKSPACE,
-              oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
-        }
-      }
-      if (_is_estimate_call) {
-        if (_q->get_device().is_gpu()) {
-          _desc_dc->get_value(
-              oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES,
-              &_workspace_estimate_bytes);
-        }
-      } else {
-        _desc_dc->commit(*_q);
-        if (_q->get_device().is_gpu()) {
-          _desc_dc->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
-                              &_workspace_bytes);
-        }
-      }
-#else
-      if (_is_user_specified_dir_and_placement && _is_inplace)
-        _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            oneapi::mkl::dft::config_value::INPLACE);
-      else
-        _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            oneapi::mkl::dft::config_value::NOT_INPLACE);
-      _desc_dc->commit(*_q);
-#endif
-    } else if ((_input_type == library_data_t::real_float &&
-                _output_type == library_data_t::complex_float) ||
-               (_input_type == library_data_t::complex_float &&
-                _output_type == library_data_t::real_float)) {
-      _desc_sr = std::make_shared<oneapi::mkl::dft::descriptor<
-          oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL>>(
-          _n);
-      if (_input_type == library_data_t::real_float &&
-          _output_type == library_data_t::complex_float)
-        _direction = fft_direction::forward;
-      else
-        _direction = fft_direction::backward;
-      _desc_sr->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                          _batch);
-#ifdef __INTEL_MKL__
-      if (_is_user_specified_dir_and_placement && _is_inplace) {
-        _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            DFTI_CONFIG_VALUE::DFTI_INPLACE);
-        set_stride_and_distance_basic<true>(_desc_sr);
-      } else {
-        _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
-        set_stride_and_distance_basic<false>(_desc_sr);
-      }
-      if (_use_external_workspace) {
-        if (_q->get_device().is_gpu()) {
-          _desc_sr->set_value(
-              oneapi::mkl::dft::config_param::WORKSPACE,
-              oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
-        }
-      }
-      if (_is_estimate_call) {
-        if (_q->get_device().is_gpu()) {
-          _desc_sr->get_value(
-              oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES,
-              &_workspace_estimate_bytes);
-        }
-      } else {
-        _desc_sr->commit(*_q);
-        if (_q->get_device().is_gpu()) {
-          _desc_sr->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
-                              &_workspace_bytes);
-        }
-      }
-#else
-      if (_is_user_specified_dir_and_placement && _is_inplace) {
-        _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            oneapi::mkl::dft::config_value::INPLACE);
-        set_stride_and_distance_basic<true>(_desc_sr);
-      } else {
-        _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            oneapi::mkl::dft::config_value::NOT_INPLACE);
-        set_stride_and_distance_basic<false>(_desc_sr);
-      }
-      _desc_sr->commit(*_q);
-#endif
-    } else if ((_input_type == library_data_t::real_double &&
-                _output_type == library_data_t::complex_double) ||
-               (_input_type == library_data_t::complex_double &&
-                _output_type == library_data_t::real_double)) {
-      _desc_dr = std::make_shared<oneapi::mkl::dft::descriptor<
-          oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::REAL>>(
-          _n);
-      if (_input_type == library_data_t::real_double &&
-          _output_type == library_data_t::complex_double)
-        _direction = fft_direction::forward;
-      else
-        _direction = fft_direction::backward;
-      _desc_dr->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,
-                          _batch);
-#ifdef __INTEL_MKL__
-      if (_is_user_specified_dir_and_placement && _is_inplace) {
-        _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            DFTI_CONFIG_VALUE::DFTI_INPLACE);
-        set_stride_and_distance_basic<true>(_desc_dr);
-      } else {
-        _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
-        set_stride_and_distance_basic<false>(_desc_dr);
-      }
-      if (_use_external_workspace) {
-        if (_q->get_device().is_gpu()) {
-          _desc_dr->set_value(
-              oneapi::mkl::dft::config_param::WORKSPACE,
-              oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);
-        }
-      }
-      if (_is_estimate_call) {
-        if (_q->get_device().is_gpu()) {
-          _desc_dr->get_value(
-              oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES,
-              &_workspace_estimate_bytes);
-        }
-      } else {
-        _desc_dr->commit(*_q);
-        if (_q->get_device().is_gpu()) {
-          _desc_dr->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,
-                              &_workspace_bytes);
-        }
-      }
-#else
-      if (_is_user_specified_dir_and_placement && _is_inplace) {
-        _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            oneapi::mkl::dft::config_value::INPLACE);
-        set_stride_and_distance_basic<true>(_desc_dr);
-      } else {
-        _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                            oneapi::mkl::dft::config_value::NOT_INPLACE);
-        set_stride_and_distance_basic<false>(_desc_dr);
-      }
-      _desc_dr->commit(*_q);
-#endif
-    } else {
-      throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
-                            "invalid fft type");
-    }
-  }
-
-  void config_and_commit_advanced() {
-#ifdef __INTEL_MKL__
-#define CONFIG_AND_COMMIT(DESC, PREC, DOM, TYPE)                               \
-  {                                                                            \
-    DESC = std::make_shared<oneapi::mkl::dft::descriptor<                      \
-        oneapi::mkl::dft::precision::PREC, oneapi::mkl::dft::domain::DOM>>(    \
-        _n);                                                                   \
-    set_stride_advanced(DESC);                                                 \
-    DESC->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, _fwd_dist);  \
-    DESC->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, _bwd_dist);  \
-    DESC->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,      \
-                    _batch);                                                   \
-    if (_is_user_specified_dir_and_placement && _is_inplace)                   \
-      DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT,               \
-                      DFTI_CONFIG_VALUE::DFTI_INPLACE);                        \
-    else                                                                       \
-      DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT,               \
-                      DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);                    \
-    if (_use_external_workspace) {                                             \
-      DESC->set_value(oneapi::mkl::dft::config_param::WORKSPACE,               \
-                      oneapi::mkl::dft::config_value::WORKSPACE_EXTERNAL);     \
-    }                                                                          \
-    if (_is_estimate_call) {                                                   \
-      if (_q->get_device().is_gpu()) {                                         \
-        DESC->get_value(                                                       \
-            oneapi::mkl::dft::config_param::WORKSPACE_ESTIMATE_BYTES,          \
-            &_workspace_estimate_bytes);                                       \
-      }                                                                        \
-    } else {                                                                   \
-      DESC->commit(*_q);                                                       \
-      if (_is_estimate_call) {                                                 \
-        DESC->get_value(oneapi::mkl::dft::config_param::WORKSPACE_BYTES,       \
-                        &_workspace_bytes);                                    \
-      }                                                                        \
-    }                                                                          \
-  }
-#else
-#define CONFIG_AND_COMMIT(DESC, PREC, DOM, TYPE)                               \
-  {                                                                            \
-    DESC = std::make_shared<oneapi::mkl::dft::descriptor<                      \
-        oneapi::mkl::dft::precision::PREC, oneapi::mkl::dft::domain::DOM>>(    \
-        _n);                                                                   \
-    set_stride_advanced(DESC);                                                 \
-    DESC->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, _fwd_dist);  \
-    DESC->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, _bwd_dist);  \
-    DESC->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS,      \
-                    _batch);                                                   \
-    if (_is_user_specified_dir_and_placement && _is_inplace)                   \
-      DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT,               \
-                      oneapi::mkl::dft::config_value::INPLACE);                \
-    else                                                                       \
-      DESC->set_value(oneapi::mkl::dft::config_param::PLACEMENT,               \
-                      oneapi::mkl::dft::config_value::NOT_INPLACE);            \
-    DESC->commit(*_q);                                                         \
-  }
-#endif
-
-    if (_input_type == library_data_t::complex_float &&
-        _output_type == library_data_t::complex_float) {
-      CONFIG_AND_COMMIT(_desc_sc, SINGLE, COMPLEX, float);
-    } else if (_input_type == library_data_t::complex_double &&
-               _output_type == library_data_t::complex_double) {
-      CONFIG_AND_COMMIT(_desc_dc, DOUBLE, COMPLEX, double);
-    } else if ((_input_type == library_data_t::real_float &&
-                _output_type == library_data_t::complex_float) ||
-               (_input_type == library_data_t::complex_float &&
-                _output_type == library_data_t::real_float)) {
-      CONFIG_AND_COMMIT(_desc_sr, SINGLE, REAL, float);
-    } else if ((_input_type == library_data_t::real_double &&
-                _output_type == library_data_t::complex_double) ||
-               (_input_type == library_data_t::complex_double &&
-                _output_type == library_data_t::real_double)) {
-      CONFIG_AND_COMMIT(_desc_dr, DOUBLE, REAL, double);
-    } else {
-      throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
-                            "invalid fft type");
-    }
-#undef CONFIG_AND_COMMIT
-  }
-
-  template <typename T>
-  void init(int dim, T *n, T *inembed, T istride, T idist,
-            library_data_t input_type, T *onembed, T ostride, T odist,
-            library_data_t output_type, T batch,
-            std::optional<std::pair<fft_direction, bool /*is_inplace*/>>
-                direction_and_placement) {
-    if (direction_and_placement.has_value()) {
-      _is_user_specified_dir_and_placement = true;
-      _direction = direction_and_placement->first;
-      _is_inplace = direction_and_placement->second;
-    }
-    _n.resize(dim);
-    _inembed.resize(dim);
-    _onembed.resize(dim);
-    _input_type = input_type;
-    _output_type = output_type;
-    for (int i = 0; i < dim; i++) {
-      _n[i] = n[i];
-    }
-    if (inembed && onembed) {
-      for (int i = 0; i < dim; i++) {
-        _inembed[i] = inembed[i];
-        _onembed[i] = onembed[i];
-      }
-      _istride = istride;
-      _ostride = ostride;
-
-      if ((_input_type == library_data_t::real_float &&
-           _output_type == library_data_t::complex_float) ||
-          (_input_type == library_data_t::real_double &&
-           _output_type == library_data_t::complex_double)) {
-        _fwd_dist = idist;
-        _bwd_dist = odist;
-      } else if ((_output_type == library_data_t::real_float &&
-                  _input_type == library_data_t::complex_float) ||
-                 (_output_type == library_data_t::real_double &&
-                  _input_type == library_data_t::complex_double)) {
-        _fwd_dist = odist;
-        _bwd_dist = idist;
-      } else {
-        if (_is_user_specified_dir_and_placement &&
-            (_direction == fft_direction::backward)) {
-          _fwd_dist = odist;
-          _bwd_dist = idist;
-        } else {
-          _fwd_dist = idist;
-          _bwd_dist = odist;
-        }
-      }
-    } else {
-      _is_basic = true;
-    }
-    _batch = batch;
-    _dim = dim;
-
-    if (_is_basic)
-      config_and_commit_basic();
-    else
-      config_and_commit_advanced();
-  }
-  template <class Desc_t>
-  void set_stride_advanced(std::shared_ptr<Desc_t> desc) {
-    if (_dim == 1) {
-      std::int64_t input_stride[2] = {0, _istride};
-      std::int64_t output_stride[2] = {0, _ostride};
-      desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,
-                      input_stride);
-      desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,
-                      output_stride);
-    } else if (_dim == 2) {
-      std::int64_t input_stride[3] = {0, _inembed[1] * _istride, _istride};
-      std::int64_t output_stride[3] = {0, _onembed[1] * _ostride, _ostride};
-      desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,
-                      input_stride);
-      desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,
-                      output_stride);
-    } else if (_dim == 3) {
-      std::int64_t input_stride[4] = {0, _inembed[2] * _inembed[1] * _istride,
-                                      _inembed[2] * _istride, _istride};
-      std::int64_t output_stride[4] = {0, _onembed[2] * _onembed[1] * _ostride,
-                                       _onembed[2] * _ostride, _ostride};
-      desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,
-                      input_stride);
-      desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,
-                      output_stride);
-    }
-  }
-
-  template <class Desc_t> void swap_distance(std::shared_ptr<Desc_t> desc) {
-    desc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, _bwd_dist);
-    desc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, _fwd_dist);
-    std::int64_t temp = _bwd_dist;
-    _bwd_dist = _fwd_dist;
-    _fwd_dist = temp;
-  }
-
-  template <bool Is_inplace, class Desc_t>
-  void set_stride_and_distance_basic(std::shared_ptr<Desc_t> desc) {
-    std::int64_t forward_distance = 0;
-    std::int64_t backward_distance = 0;
-
-#define SET_STRIDE                                                             \
-  {                                                                            \
-    if (_direction == fft_direction::forward) {                                \
-      desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,           \
-                      real_stride);                                            \
-      desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,          \
-                      complex_stride);                                         \
-    } else {                                                                   \
-      desc->set_value(oneapi::mkl::dft::config_param::INPUT_STRIDES,           \
-                      complex_stride);                                         \
-      desc->set_value(oneapi::mkl::dft::config_param::OUTPUT_STRIDES,          \
-                      real_stride);                                            \
-    }                                                                          \
-  }
-    if (_dim == 1) {
-      if constexpr (Is_inplace) {
-        std::int64_t real_stride[2] = {0, 1};
-        std::int64_t complex_stride[2] = {0, 1};
-        SET_STRIDE;
-        forward_distance = 2 * (_n[0] / 2 + 1);
-        backward_distance = _n[0] / 2 + 1;
-      } else {
-        std::int64_t real_stride[2] = {0, 1};
-        std::int64_t complex_stride[2] = {0, 1};
-        SET_STRIDE;
-        forward_distance = _n[0];
-        backward_distance = _n[0] / 2 + 1;
-      }
-    } else if (_dim == 2) {
-      if constexpr (Is_inplace) {
-        std::int64_t complex_stride[3] = {0, _n[1] / 2 + 1, 1};
-        std::int64_t real_stride[3] = {0, 2 * (_n[1] / 2 + 1), 1};
-        SET_STRIDE;
-        forward_distance = _n[0] * 2 * (_n[1] / 2 + 1);
-        backward_distance = _n[0] * (_n[1] / 2 + 1);
-      } else {
-        std::int64_t complex_stride[3] = {0, _n[1] / 2 + 1, 1};
-        std::int64_t real_stride[3] = {0, _n[1], 1};
-        SET_STRIDE;
-        forward_distance = _n[0] * _n[1];
-        backward_distance = _n[0] * (_n[1] / 2 + 1);
-      }
-    } else if (_dim == 3) {
-      if constexpr (Is_inplace) {
-        std::int64_t complex_stride[4] = {0, _n[1] * (_n[2] / 2 + 1),
-                                          _n[2] / 2 + 1, 1};
-        std::int64_t real_stride[4] = {0, _n[1] * 2 * (_n[2] / 2 + 1),
-                                       2 * (_n[2] / 2 + 1), 1};
-        SET_STRIDE;
-        forward_distance = _n[0] * _n[1] * 2 * (_n[2] / 2 + 1);
-        backward_distance = _n[0] * _n[1] * (_n[2] / 2 + 1);
-      } else {
-        std::int64_t complex_stride[4] = {0, _n[1] * (_n[2] / 2 + 1),
-                                          _n[2] / 2 + 1, 1};
-        std::int64_t real_stride[4] = {0, _n[1] * _n[2], _n[2], 1};
-        SET_STRIDE;
-        forward_distance = _n[0] * _n[1] * _n[2];
-        backward_distance = _n[0] * _n[1] * (_n[2] / 2 + 1);
-      }
-    }
-#undef SET_STRIDE
-    desc->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,
-                    forward_distance);
-    desc->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,
-                    backward_distance);
-  }
-
-#define COMPUTE(DESC)                                                          \
-  {                                                                            \
-    if (_is_inplace) {                                                         \
-      auto data_input = dpct::detail::get_memory<T>(input);                    \
-      if (_direction == fft_direction::forward) {                              \
-        oneapi::mkl::dft::compute_forward<                                     \
-            std::remove_reference_t<decltype(*DESC)>, T>(*DESC, data_input);   \
-      } else {                                                                 \
-        oneapi::mkl::dft::compute_backward<                                    \
-            std::remove_reference_t<decltype(*DESC)>, T>(*DESC, data_input);   \
-      }                                                                        \
-    } else {                                                                   \
-      auto data_input = dpct::detail::get_memory<T>(input);                    \
-      auto data_output = dpct::detail::get_memory<T>(output);                  \
-      if (_direction == fft_direction::forward) {                              \
-        oneapi::mkl::dft::compute_forward<                                     \
-            std::remove_reference_t<decltype(*DESC)>, T, T>(*DESC, data_input, \
-                                                            data_output);      \
-      } else {                                                                 \
-        oneapi::mkl::dft::compute_backward<                                    \
-            std::remove_reference_t<decltype(*DESC)>, T, T>(*DESC, data_input, \
-                                                            data_output);      \
-      }                                                                        \
-    }                                                                          \
-  }
-
-  template <class T, oneapi::mkl::dft::precision Precision>
-  void compute_complex(T *input, T *output, fft_direction direction) {
-    bool is_this_compute_inplace = input == output;
-
-    if (!_is_user_specified_dir_and_placement) {
-      // The complex domain descriptor need different config values if the
-      // FFT direction or placement is different.
-      // Here we check the conditions, and new config values are set and
-      // re-committed if needed.
-      if (direction != _direction || is_this_compute_inplace != _is_inplace) {
-        if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) {
-          if (direction != _direction) {
-            swap_distance(_desc_sc);
-            _direction = direction;
-          }
-          if (is_this_compute_inplace != _is_inplace) {
-            _is_inplace = is_this_compute_inplace;
-#ifdef __INTEL_MKL__
-            if (_is_inplace) {
-              _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                  DFTI_CONFIG_VALUE::DFTI_INPLACE);
-            } else {
-              _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                  DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
-            }
-#else
-            if (_is_inplace) {
-              _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                  oneapi::mkl::dft::config_value::INPLACE);
-            } else {
-              _desc_sc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                  oneapi::mkl::dft::config_value::NOT_INPLACE);
-            }
-#endif
-          }
-          _desc_sc->commit(*_q);
-        } else {
-          if (direction != _direction) {
-            swap_distance(_desc_dc);
-            _direction = direction;
-          }
-          if (is_this_compute_inplace != _is_inplace) {
-            _is_inplace = is_this_compute_inplace;
-#ifdef __INTEL_MKL__
-            if (_is_inplace) {
-              _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                  DFTI_CONFIG_VALUE::DFTI_INPLACE);
-            } else {
-              _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                  DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
-            }
-#else
-            if (_is_inplace) {
-              _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                  oneapi::mkl::dft::config_value::INPLACE);
-            } else {
-              _desc_dc->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                  oneapi::mkl::dft::config_value::NOT_INPLACE);
-            }
-#endif
-          }
-          _desc_dc->commit(*_q);
-        }
-      }
-    }
-
-    if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) {
-      COMPUTE(_desc_sc);
-    } else {
-      COMPUTE(_desc_dc);
-    }
-  }
-
-  template <class T, oneapi::mkl::dft::precision Precision>
-  void compute_real(T *input, T *output) {
-    bool is_this_compute_inplace = input == output;
-
-    if (!_is_user_specified_dir_and_placement) {
-      // The real domain descriptor need different config values if the
-      // FFT placement is different.
-      // Here we check the condition, and new config values are set and
-      // re-committed if needed.
-      if (is_this_compute_inplace != _is_inplace) {
-        if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) {
-          _is_inplace = is_this_compute_inplace;
-          if (_is_inplace) {
-#ifdef __INTEL_MKL__
-            _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                DFTI_CONFIG_VALUE::DFTI_INPLACE);
-#else
-            _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                oneapi::mkl::dft::config_value::INPLACE);
-#endif
-            if (_is_basic)
-              set_stride_and_distance_basic<true>(_desc_sr);
-          } else {
-#ifdef __INTEL_MKL__
-            _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
-#else
-            _desc_sr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                oneapi::mkl::dft::config_value::NOT_INPLACE);
-#endif
-            if (_is_basic)
-              set_stride_and_distance_basic<false>(_desc_sr);
-          }
-          _desc_sr->commit(*_q);
-        } else {
-          _is_inplace = is_this_compute_inplace;
-          if (_is_inplace) {
-#ifdef __INTEL_MKL__
-            _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                DFTI_CONFIG_VALUE::DFTI_INPLACE);
-#else
-            _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                oneapi::mkl::dft::config_value::INPLACE);
-#endif
-            if (_is_basic)
-              set_stride_and_distance_basic<true>(_desc_dr);
-          } else {
-#ifdef __INTEL_MKL__
-            _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                DFTI_CONFIG_VALUE::DFTI_NOT_INPLACE);
-#else
-            _desc_dr->set_value(oneapi::mkl::dft::config_param::PLACEMENT,
-                                oneapi::mkl::dft::config_value::NOT_INPLACE);
-#endif
-            if (_is_basic)
-              set_stride_and_distance_basic<false>(_desc_dr);
-          }
-          _desc_dr->commit(*_q);
-        }
-      }
-    }
-
-    if constexpr (Precision == oneapi::mkl::dft::precision::SINGLE) {
-      COMPUTE(_desc_sr);
-    } else {
-      COMPUTE(_desc_dr);
-    }
-  }
-#undef COMPUTE
-
-private:
-  sycl::queue *_q = nullptr;
-  int _dim;
-  std::vector<std::int64_t> _n;
-  std::vector<std::int64_t> _inembed;
-  std::int64_t _istride;
-  std::int64_t _fwd_dist;
-  library_data_t _input_type;
-  std::vector<std::int64_t> _onembed;
-  std::int64_t _ostride;
-  std::int64_t _bwd_dist;
-  library_data_t _output_type;
-  std::int64_t _batch = 1;
-  bool _is_basic = false;
-  bool _is_inplace = false;
-  fft_direction _direction = fft_direction::forward;
-  bool _is_user_specified_dir_and_placement = false;
-  bool _use_external_workspace = false;
-  void *_external_workspace_ptr = nullptr;
-  size_t _workspace_bytes = 0;
-  bool _is_estimate_call = false;
-  size_t _workspace_estimate_bytes = 0;
-  std::shared_ptr<oneapi::mkl::dft::descriptor<
-      oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL>>
-      _desc_sr;
-  std::shared_ptr<oneapi::mkl::dft::descriptor<
-      oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::REAL>>
-      _desc_dr;
-  std::shared_ptr<oneapi::mkl::dft::descriptor<
-      oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::COMPLEX>>
-      _desc_sc;
-  std::shared_ptr<oneapi::mkl::dft::descriptor<
-      oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::COMPLEX>>
-      _desc_dc;
-};
-
-using fft_engine_ptr = fft_engine *;
-} // namespace fft
-} // namespace dpct
-
-#endif // __DPCT_FFT_UTILS_HPP__
diff --git a/dpct/image.hpp b/dpct/image.hpp
deleted file mode 100644
index b9bb246685e7b..0000000000000
--- a/dpct/image.hpp
+++ /dev/null
@@ -1,901 +0,0 @@
-//==---- image.hpp --------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_IMAGE_HPP__
-#define __DPCT_IMAGE_HPP__
-
-#include <sycl/sycl.hpp>
-
-#include "memory.hpp"
-#include "util.hpp"
-
-namespace dpct {
-
-enum class image_channel_data_type {
-  signed_int,
-  unsigned_int,
-  fp,
-};
-
-class image_channel;
-class image_wrapper_base;
-namespace detail {
-/// Image object type traits, with accessor type and sampled data type defined.
-/// The data type of an image accessor must be one of sycl::int4, sycl::uint4,
-/// sycl::float4 and sycl::half4. The data type of accessors with 8bits/16bits
-/// channel width will be 32 bits. sycl::half is an exception.
-template <class T> struct image_trait {
-  using acc_data_t = sycl::vec<T, 4>;
-  template <int dimensions>
-  using accessor_t =
-      sycl::accessor<acc_data_t, dimensions, sycl::access_mode::read,
-                         sycl::access::target::image>;
-  template <int dimensions>
-  using array_accessor_t =
-      sycl::accessor<acc_data_t, dimensions, sycl::access_mode::read,
-                         sycl::access::target::image_array>;
-  using data_t = T;
-  using elem_t = T;
-  static constexpr image_channel_data_type data_type =
-      std::is_integral<T>::value
-          ? (std::is_signed<T>::value ? image_channel_data_type::signed_int
-                                      : image_channel_data_type::unsigned_int)
-          : image_channel_data_type::fp;
-  static constexpr int channel_num = 1;
-};
-template <>
-struct image_trait<std::uint8_t> : public image_trait<std::uint32_t> {
-  using data_t = std::uint8_t;
-  using elem_t = data_t;
-};
-template <>
-struct image_trait<std::uint16_t>
-    : public image_trait<std::uint32_t> {
-  using data_t = std::uint16_t;
-  using elem_t = data_t;
-};
-template <>
-struct image_trait<std::int8_t> : public image_trait<std::int32_t> {
-  using data_t = std::int8_t;
-  using elem_t = data_t;
-};
-template <>
-struct image_trait<std::int16_t> : public image_trait<std::int32_t> {
-  using data_t = std::int16_t;
-  using elem_t = data_t;
-};
-template <>
-struct image_trait<char>
-    : public image_trait<typename std::conditional<
-          std::is_signed<char>::value, signed char, unsigned char>::type> {};
-
-template <class T>
-struct image_trait<sycl::vec<T, 1>> : public image_trait<T> {};
-
-template <class T>
-struct image_trait<sycl::vec<T, 2>> : public image_trait<T> {
-  using data_t = sycl::vec<T, 2>;
-  static constexpr int channel_num = 2;
-};
-
-template <class T>
-struct image_trait<sycl::vec<T, 3>>
-    : public image_trait<sycl::vec<T, 4>> {
-  static constexpr int channel_num = 3;
-};
-
-template <class T>
-struct image_trait<sycl::vec<T, 4>> : public image_trait<T> {
-  using data_t = sycl::vec<T, 4>;
-  static constexpr int channel_num = 4;
-};
-
-/// Functor to fetch data from read result of an image accessor.
-template <class T> struct fetch_data {
-  using return_t = typename image_trait<T>::data_t;
-  using acc_data_t = typename image_trait<T>::acc_data_t;
-
-  return_t operator()(acc_data_t &&original_data) {
-    return (return_t)original_data.r();
-  }
-};
-template <class T>
-struct fetch_data<sycl::vec<T, 1>> : public fetch_data<T> {};
-template <class T> struct fetch_data<sycl::vec<T, 2>> {
-  using return_t = typename image_trait<sycl::vec<T, 2>>::data_t;
-  using acc_data_t = typename image_trait<sycl::vec<T, 2>>::acc_data_t;
-
-  return_t operator()(acc_data_t &&origin_data) {
-    return return_t(origin_data.r(), origin_data.g());
-  }
-};
-template <class T>
-struct fetch_data<sycl::vec<T, 3>>
-    : public fetch_data<sycl::vec<T, 4>> {};
-template <class T> struct fetch_data<sycl::vec<T, 4>> {
-  using return_t = typename image_trait<sycl::vec<T, 4>>::data_t;
-  using acc_data_t = typename image_trait<sycl::vec<T, 4>>::acc_data_t;
-
-  return_t operator()(acc_data_t &&origin_data) {
-    return return_t(origin_data.r(), origin_data.g(), origin_data.b(),
-                    origin_data.a());
-  }
-};
-
-/// Create image according with given type \p T and \p dims.
-template <class T> static image_wrapper_base *create_image_wrapper(int dims);
-
-/// Create image with given data type \p T, channel order and dims
-template <class T>
-static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims);
-
-/// Create image with channel info and specified dimensions.
-static image_wrapper_base *create_image_wrapper(image_channel channel, int dims);
-
-} // namespace detail
-
-/// Image channel info, include channel number, order, data width and type
-class image_channel {
-  image_channel_data_type _type = image_channel_data_type::signed_int;
-  /// Number of channels.
-  unsigned _channel_num = 0;
-  /// Total size of all channels in bytes.
-  unsigned _total_size = 0;
-  /// Size of each channel in bytes.
-  unsigned _channel_size = 0;
-
-public:
-  /// Create image channel info according to template argument \p T.
-  template <class T> static image_channel create() {
-    image_channel channel;
-    channel.set_channel_size(detail::image_trait<T>::channel_num,
-                             sizeof(typename detail::image_trait<T>::elem_t) *
-                                 8);
-    channel.set_channel_data_type(detail::image_trait<T>::data_type);
-    return channel;
-  }
-
-  image_channel() = default;
-
-  image_channel_data_type get_channel_data_type() { return _type; }
-  void set_channel_data_type(image_channel_data_type type) { _type = type; }
-
-  unsigned get_total_size() { return _total_size; }
-
-  unsigned get_channel_num() { return _channel_num; }
-  void set_channel_num(unsigned channel_num) {
-    _channel_num = channel_num;
-    _total_size = _channel_size * _channel_num;
-  }
-
-  /// image_channel constructor.
-  /// \param r Channel r width in bits.
-  /// \param g Channel g width in bits. Should be same with \p r, or zero.
-  /// \param b Channel b width in bits. Should be same with \p g, or zero.
-  /// \param a Channel a width in bits. Should be same with \p b, or zero.
-  /// \param data_type Image channel data type: signed_nt, unsigned_int or fp.
-  image_channel(int r, int g, int b, int a, image_channel_data_type data_type) {
-    _type = data_type;
-    if (a) {
-      assert(r == a && "SYCL doesn't support different channel size");
-      assert(r == b && "SYCL doesn't support different channel size");
-      assert(r == g && "SYCL doesn't support different channel size");
-      set_channel_size(4, a);
-    } else if (b) {
-      assert(r == b && "SYCL doesn't support different channel size");
-      assert(r == g && "SYCL doesn't support different channel size");
-      set_channel_size(3, b);
-    } else if (g) {
-      assert(r == g && "SYCL doesn't support different channel size");
-      set_channel_size(2, g);
-    } else {
-      set_channel_size(1, r);
-    }
-  }
-
-  sycl::image_channel_type get_channel_type() const {
-    if (_channel_size == 4) {
-      if (_type == image_channel_data_type::signed_int)
-        return sycl::image_channel_type::signed_int32;
-      else if (_type == image_channel_data_type::unsigned_int)
-        return sycl::image_channel_type::unsigned_int32;
-      else if (_type == image_channel_data_type::fp)
-        return sycl::image_channel_type::fp32;
-    } else if (_channel_size == 2) {
-      if (_type == image_channel_data_type::signed_int)
-        return sycl::image_channel_type::signed_int16;
-      else if (_type == image_channel_data_type::unsigned_int)
-        return sycl::image_channel_type::unsigned_int16;
-      else if (_type == image_channel_data_type::fp)
-        return sycl::image_channel_type::fp16;
-    } else {
-      if (_type == image_channel_data_type::signed_int)
-        return sycl::image_channel_type::signed_int8;
-      else if (_type == image_channel_data_type::unsigned_int)
-        return sycl::image_channel_type::unsigned_int8;
-    }
-    assert(false && "unexpected channel data kind and channel size");
-    return sycl::image_channel_type::signed_int32;
-  }
-  void set_channel_type(sycl::image_channel_type type) {
-    switch (type) {
-    case sycl::image_channel_type::unsigned_int8:
-      _type = image_channel_data_type::unsigned_int;
-      _channel_size = 1;
-      break;
-    case sycl::image_channel_type::unsigned_int16:
-      _type = image_channel_data_type::unsigned_int;
-      _channel_size = 2;
-      break;
-    case sycl::image_channel_type::unsigned_int32:
-      _type = image_channel_data_type::unsigned_int;
-      _channel_size = 4;
-      break;
-    case sycl::image_channel_type::signed_int8:
-      _type = image_channel_data_type::signed_int;
-      _channel_size = 1;
-      break;
-    case sycl::image_channel_type::signed_int16:
-      _type = image_channel_data_type::signed_int;
-      _channel_size = 2;
-      break;
-    case sycl::image_channel_type::signed_int32:
-      _type = image_channel_data_type::signed_int;
-      _channel_size = 4;
-      break;
-    case sycl::image_channel_type::fp16:
-      _type = image_channel_data_type::fp;
-      _channel_size = 2;
-      break;
-    case sycl::image_channel_type::fp32:
-      _type = image_channel_data_type::fp;
-      _channel_size = 4;
-      break;
-    default:
-      break;
-    }
-    _total_size = _channel_size * _channel_num;
-  }
-
-  sycl::image_channel_order get_channel_order() const {
-    switch (_channel_num) {
-    case 1:
-      return sycl::image_channel_order::r;
-    case 2:
-      return sycl::image_channel_order::rg;
-    case 3:
-      return sycl::image_channel_order::rgb;
-    case 4:
-      return sycl::image_channel_order::rgba;
-    default:
-      return sycl::image_channel_order::r;
-    }
-  }
-  /// Get the size for each channel in bits.
-  unsigned get_channel_size() const { return _channel_size * 8; }
-
-  /// Set channel size.
-  /// \param in_channel_num Channels number to set.
-  /// \param channel_size Size for each channel in bits.
-  void set_channel_size(unsigned in_channel_num,
-                        unsigned channel_size) {
-    if (in_channel_num < _channel_num)
-      return;
-    _channel_num = in_channel_num;
-    _channel_size = channel_size / 8;
-    _total_size = _channel_size * _channel_num;
-  }
-};
-
-/// 2D or 3D matrix data for image.
-class image_matrix {
-  image_channel _channel;
-  int _range[3] = {1, 1, 1};
-  int _dims = 0;
-  void *_host_data = nullptr;
-
-  /// Set range of each dimension.
-  template <int dimensions> void set_range(sycl::range<dimensions> range) {
-    for (int i = 0; i < dimensions; ++i)
-      _range[i] = range[i];
-    _dims = dimensions;
-  }
-
-  template <int... DimIdx>
-  sycl::range<sizeof...(DimIdx)> get_range(integer_sequence<DimIdx...>) {
-    return sycl::range<sizeof...(DimIdx)>(_range[DimIdx]...);
-  }
-
-public:
-  /// Constructor with channel info and dimension size info.
-  template <int dimensions>
-  image_matrix(image_channel channel, sycl::range<dimensions> range)
-      : _channel(channel) {
-    set_range(range);
-    _host_data = std::malloc(range.size() * _channel.get_total_size());
-  }
-  image_matrix(sycl::image_channel_type channel_type, unsigned channel_num,
-               size_t x, size_t y) {
-    _channel.set_channel_type(channel_type);
-    _channel.set_channel_num(channel_num);
-    _dims = 1;
-    _range[0] = x;
-    if (y) {
-      _dims = 2;
-      _range[1] = y;
-    }
-    _host_data = std::malloc(_range[0] * _range[1] * _channel.get_total_size());
-  }
-
-  /// Construct a new image class with the matrix data.
-  template <int dimensions> sycl::image<dimensions> *create_image() {
-    return create_image<dimensions>(_channel);
-  }
-  /// Construct a new image class with the matrix data.
-  template <int dimensions>
-  sycl::image<dimensions> *create_image(image_channel channel) {
-    return new sycl::image<dimensions>(
-        _host_data, channel.get_channel_order(), channel.get_channel_type(),
-        get_range(make_index_sequence<dimensions>()),
-        sycl::property::image::use_host_ptr());
-  }
-
-  /// Get channel info.
-  inline image_channel get_channel() { return _channel; }
-  /// Get range of the image.
-  sycl::range<3> get_range() {
-    return sycl::range<3>(_range[0], _range[1], _range[2]);
-  }
-  /// Get matrix dims.
-  inline int get_dims() { return _dims; }
-  /// Convert to pitched data.
-  pitched_data to_pitched_data() {
-    return pitched_data(_host_data, _range[0] * _channel.get_total_size(),
-                        _range[0], _range[1]);
-  }
-
-  ~image_matrix() {
-    if (_host_data)
-      std::free(_host_data);
-    _host_data = nullptr;
-  }
-};
-using image_matrix_p = image_matrix *;
-
-enum class image_data_type { matrix, linear, pitch, unsupport };
-
-/// Image data info.
-class image_data {
-public:
-  image_data() { _type = image_data_type::unsupport; }
-  image_data(image_matrix_p matrix_data) { set_data(matrix_data); }
-  image_data(void *data_ptr, size_t x_size, image_channel channel) {
-    set_data(data_ptr, x_size, channel);
-  }
-  image_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size,
-             image_channel channel) {
-    set_data(data_ptr, x_size, y_size, pitch_size, channel);
-  }
-  void set_data(image_matrix_p matrix_data) {
-    _type = image_data_type::matrix;
-    _data = matrix_data;
-    _channel = matrix_data->get_channel();
-  }
-  void set_data(void *data_ptr, size_t x_size, image_channel channel) {
-    _type = image_data_type::linear;
-    _data = data_ptr;
-    _x = x_size;
-    _channel = channel;
-  }
-  void set_data(void *data_ptr, size_t x_size, size_t y_size, size_t pitch_size,
-                image_channel channel) {
-    _type = image_data_type::pitch;
-    _data = data_ptr;
-    _x = x_size;
-    _y = y_size;
-    _pitch = pitch_size;
-    _channel = channel;
-  }
-
-  image_data_type get_data_type() const { return _type; }
-  void set_data_type(image_data_type type) { _type = type; }
-
-  void *get_data_ptr() const { return _data; }
-  void set_data_ptr(void *data) { _data = data; }
-
-  size_t get_x() const { return _x; }
-  void set_x(size_t x) { _x = x; }
-
-  size_t get_y() const { return _y; }
-  void set_y(size_t y) { _y = y; }
-
-  size_t get_pitch() const { return _pitch; }
-  void set_pitch(size_t pitch) { _pitch = pitch; }
-
-  image_channel get_channel() const { return _channel; }
-  void set_channel(image_channel channel) { _channel = channel; }
-
-  image_channel_data_type get_channel_data_type() {
-    return _channel.get_channel_data_type();
-  }
-  void set_channel_data_type(image_channel_data_type type) {
-    _channel.set_channel_data_type(type);
-  }
-
-  unsigned get_channel_size() { return _channel.get_channel_size(); }
-  void set_channel_size(unsigned channel_num, unsigned channel_size) {
-    return _channel.set_channel_size(channel_num, channel_size);
-  }
-
-  unsigned get_channel_num() { return _channel.get_channel_num(); }
-  void set_channel_num(unsigned num) {
-    return _channel.set_channel_num(num);
-  }
-
-  sycl::image_channel_type get_channel_type() {
-    return _channel.get_channel_type();
-  }
-  void set_channel_type(sycl::image_channel_type type) {
-    return _channel.set_channel_type(type);
-  }
-
-private:
-  image_data_type _type;
-  void *_data = nullptr;
-  size_t _x, _y, _pitch;
-  image_channel _channel;
-};
-
-/// Image sampling info, include addressing mode, filtering mode and
-/// normalization info.
-class sampling_info {
-  sycl::addressing_mode _addressing_mode =
-      sycl::addressing_mode::clamp_to_edge;
-  sycl::filtering_mode _filtering_mode = sycl::filtering_mode::nearest;
-  sycl::coordinate_normalization_mode _coordinate_normalization_mode =
-      sycl::coordinate_normalization_mode::unnormalized;
-
-public:
-  sycl::addressing_mode get_addressing_mode() { return _addressing_mode; }
-  void set(sycl::addressing_mode addressing_mode) { _addressing_mode = addressing_mode; }
-
-  sycl::filtering_mode get_filtering_mode() { return _filtering_mode; }
-  void set(sycl::filtering_mode filtering_mode) { _filtering_mode = filtering_mode; }
-
-  sycl::coordinate_normalization_mode get_coordinate_normalization_mode() {
-    return _coordinate_normalization_mode;
-  }
-  void set(sycl::coordinate_normalization_mode coordinate_normalization_mode) {
-    _coordinate_normalization_mode = coordinate_normalization_mode;
-  }
-
-  bool is_coordinate_normalized() {
-    return _coordinate_normalization_mode ==
-           sycl::coordinate_normalization_mode::normalized;
-  }
-  void set_coordinate_normalization_mode(int is_normalized) {
-    _coordinate_normalization_mode =
-        is_normalized ? sycl::coordinate_normalization_mode::normalized
-                      : sycl::coordinate_normalization_mode::unnormalized;
-  }
-  void
-  set(sycl::addressing_mode addressing_mode,
-      sycl::filtering_mode filtering_mode,
-      sycl::coordinate_normalization_mode coordinate_normalization_mode) {
-    set(addressing_mode);
-    set(filtering_mode);
-    set(coordinate_normalization_mode);
-  }
-  void set(sycl::addressing_mode addressing_mode,
-           sycl::filtering_mode filtering_mode, int is_normalized) {
-    set(addressing_mode);
-    set(filtering_mode);
-    set_coordinate_normalization_mode(is_normalized);
-  }
-
-  sycl::sampler get_sampler() {
-    return sycl::sampler(_coordinate_normalization_mode, _addressing_mode,
-                             _filtering_mode);
-  }
-};
-
-/// Image base class.
-class image_wrapper_base {
-  sampling_info _sampling_info;
-  image_data _data;
-
-public:
-  virtual ~image_wrapper_base() = 0;
-
-  void attach(image_data data) { set_data(data); }
-  /// Attach matrix data to this class.
-  void attach(image_matrix *matrix) {
-    detach();
-    image_wrapper_base::set_data(image_data(matrix));
-  }
-  /// Attach matrix data to this class.
-  void attach(image_matrix *matrix, image_channel channel) {
-    attach(matrix);
-    image_wrapper_base::set_channel(channel);
-  }
-  /// Attach linear data to this class.
-  void attach(const void *ptr, size_t count) {
-    attach(ptr, count, get_channel());
-  }
-  /// Attach linear data to this class.
-  void attach(const void *ptr, size_t count, image_channel channel) {
-    detach();
-    image_wrapper_base::set_data(image_data(const_cast<void *>(ptr), count, channel));
-  }
-  /// Attach 2D data to this class.
-  void attach(const void *data, size_t x, size_t y, size_t pitch) {
-    attach(data, x, y, pitch, get_channel());
-  }
-  /// Attach 2D data to this class.
-  void attach(const void *data, size_t x, size_t y, size_t pitch,
-              image_channel channel) {
-    detach();
-    image_wrapper_base::set_data(
-        image_data(const_cast<void *>(data), x, y, pitch, channel));
-  }
-  /// Detach data.
-  virtual void detach() {}
-
-  sampling_info get_sampling_info() { return _sampling_info; }
-  void set_sampling_info(sampling_info info) {
-    _sampling_info = info;
-  }
-  const image_data &get_data() { return _data; }
-  void set_data(image_data data) { _data = data; }
-
-  image_channel get_channel() { return _data.get_channel(); }
-  void set_channel(image_channel channel) { _data.set_channel(channel); }
-
-  image_channel_data_type get_channel_data_type() {
-    return _data.get_channel_data_type();
-  }
-  void set_channel_data_type(image_channel_data_type type) {
-    _data.set_channel_data_type(type);
-  }
-
-  unsigned get_channel_size() { return _data.get_channel_size(); }
-  void set_channel_size(unsigned channel_num, unsigned channel_size) {
-    return _data.set_channel_size(channel_num, channel_size);
-  }
-
-  sycl::addressing_mode get_addressing_mode() {
-    return _sampling_info.get_addressing_mode();
-  }
-  void set(sycl::addressing_mode addressing_mode) {
-    _sampling_info.set(addressing_mode);
-  }
-
-  sycl::filtering_mode get_filtering_mode() {
-    return _sampling_info.get_filtering_mode();
-  }
-  void set(sycl::filtering_mode filtering_mode) {
-    _sampling_info.set(filtering_mode);
-  }
-
-  sycl::coordinate_normalization_mode get_coordinate_normalization_mode() {
-    return _sampling_info.get_coordinate_normalization_mode();
-  }
-  void
-  set(sycl::coordinate_normalization_mode coordinate_normalization_mode) {
-    _sampling_info.set(coordinate_normalization_mode);
-  }
-
-  bool is_coordinate_normalized() {
-    return _sampling_info.is_coordinate_normalized();
-  }
-  void set_coordinate_normalization_mode(int is_normalized) {
-    _sampling_info.set_coordinate_normalization_mode(is_normalized);
-  }
-  void
-  set(sycl::addressing_mode addressing_mode,
-      sycl::filtering_mode filtering_mode,
-      sycl::coordinate_normalization_mode coordinate_normalization_mode) {
-    set(addressing_mode);
-    set(filtering_mode);
-    set(coordinate_normalization_mode);
-  }
-  void set(sycl::addressing_mode addressing_mode,
-           sycl::filtering_mode filtering_mode, int is_normalized) {
-    set(addressing_mode);
-    set(filtering_mode);
-    set_coordinate_normalization_mode(is_normalized);
-  }
-
-  unsigned get_channel_num() { return _data.get_channel_num(); }
-  void set_channel_num(unsigned num) {
-    return _data.set_channel_num(num);
-  }
-
-  sycl::image_channel_type get_channel_type() {
-    return _data.get_channel_type();
-  }
-  void set_channel_type(sycl::image_channel_type type) {
-    return _data.set_channel_type(type);
-  }
-
-  sycl::sampler get_sampler() {
-    sycl::sampler smp = _sampling_info.get_sampler();
-    /// linear memory only used for sycl::filtering_mode::nearest.
-    if (_data.get_data_type() == image_data_type::linear) {
-      smp = sycl::sampler(smp.get_coordinate_normalization_mode(),
-                          smp.get_addressing_mode(),
-                          sycl::filtering_mode::nearest);
-    }
-    return smp;
-  }
-};
-inline image_wrapper_base::~image_wrapper_base() {}
-using image_wrapper_base_p = image_wrapper_base *;
-
-template <class T, int dimensions, bool IsImageArray> class image_accessor_ext;
-
-/// Image class, wrapper of sycl::image.
-template <class T, int dimensions, bool IsImageArray = false> class image_wrapper : public image_wrapper_base {
-  sycl::image<dimensions> *_image = nullptr;
-
-#ifndef DPCT_USM_LEVEL_NONE
-  std::vector<char> _host_buffer;
-#endif
-
-  void create_image(sycl::queue q) {
-    auto &data = get_data();
-    if (data.get_data_type() == image_data_type::matrix) {
-      _image = static_cast<image_matrix_p>(data.get_data_ptr())
-          ->create_image<dimensions>(data.get_channel());
-      return;
-    }
-    auto ptr = data.get_data_ptr();
-    auto channel = data.get_channel();
-
-    if (detail::get_pointer_attribute(q, ptr) == detail::pointer_access_attribute::device_only) {
-#ifdef DPCT_USM_LEVEL_NONE
-      ptr = get_buffer(ptr)
-                .template get_access<sycl::access_mode::read_write>()
-                .get_pointer();
-#else
-      auto sz = data.get_x();
-      if (data.get_data_type() == image_data_type::pitch)
-        sz *= channel.get_total_size() * data.get_y();
-      _host_buffer.resize(sz);
-      q.memcpy(_host_buffer.data(), ptr, sz).wait();
-      ptr = _host_buffer.data();
-#endif
-    }
-
-    if constexpr (dimensions == 1) {
-      assert(data.get_data_type() == image_data_type::linear);
-      _image = new sycl::image<1>(
-        ptr, channel.get_channel_order(), channel.get_channel_type(),
-        sycl::range<1>(data.get_x() / channel.get_total_size()));
-    } else if constexpr (dimensions == 2) {
-      assert(data.get_data_type() == image_data_type::pitch);
-      _image = new sycl::image<2>(ptr, channel.get_channel_order(),
-                                  channel.get_channel_type(),
-                                  sycl::range<2>(data.get_x(), data.get_y()),
-                                  sycl::range<1>(data.get_pitch()));
-    } else {
-      throw std::runtime_error("3D image only support matrix data");
-    }
-    return;
-  }
-
-public:
-  using acc_data_t = typename detail::image_trait<T>::acc_data_t;
-  using accessor_t =
-      typename image_accessor_ext<T, IsImageArray ? (dimensions - 1) : dimensions,
-                              IsImageArray>::accessor_t;
-
-  image_wrapper() { set_channel(image_channel::create<T>()); }
-  ~image_wrapper() { detach(); }
-
-  /// Get image accessor.
-  accessor_t get_access(sycl::handler &cgh, sycl::queue &q = get_default_queue()) {
-    if (!_image)
-      create_image(q);
-    return accessor_t(*_image, cgh);
-  }
-
-  /// Detach data.
-  void detach() override {
-    if (_image)
-      delete _image;
-    _image = nullptr;
-  }
-};
-
-/// Wrap sampler and image accessor together.
-template <class T, int dimensions, bool IsImageArray = false>
-class image_accessor_ext {
-public:
-  using accessor_t =
-      typename detail::image_trait<T>::template accessor_t<dimensions>;
-  using data_t = typename detail::image_trait<T>::data_t;
-  sycl::sampler _sampler;
-  accessor_t _img_acc;
-
-public:
-  image_accessor_ext(sycl::sampler sampler, accessor_t acc)
-      : _sampler(sampler), _img_acc(acc) {}
-
-  /// Read data from accessor.
-  template <bool Available = dimensions == 3>
-  typename std::enable_if<Available, data_t>::type read(float x, float y,
-                                                        float z) {
-    return detail::fetch_data<T>()(
-        _img_acc.read(sycl::float4(x, y, z, 0), _sampler));
-  }
-  /// Read data from accessor.
-  template <class Coord0, class Coord1, class Coord2,
-            bool Available = dimensions == 3 &&
-                             std::is_integral<Coord0>::value
-                                 &&std::is_integral<Coord1>::value
-                                     &&std::is_integral<Coord2>::value>
-  typename std::enable_if<Available, data_t>::type read(Coord0 x, Coord1 y,
-                                                        Coord2 z) {
-    return detail::fetch_data<T>()(
-        _img_acc.read(sycl::int4(x, y, z, 0), _sampler));
-  }
-  /// Read data from accessor.
-  template <bool Available = dimensions == 2>
-  typename std::enable_if<Available, data_t>::type read(float x, float y) {
-    return detail::fetch_data<T>()(
-        _img_acc.read(sycl::float2(x, y), _sampler));
-  }
-  /// Read data from accessor.
-  template <class Coord0, class Coord1,
-            bool Available = dimensions == 2 &&
-                             std::is_integral<Coord0>::value
-                                 &&std::is_integral<Coord1>::value>
-  typename std::enable_if<Available, data_t>::type read(Coord0 x, Coord1 y) {
-    return detail::fetch_data<T>()(
-        _img_acc.read(sycl::int2(x, y), _sampler));
-  }
-  /// Read data from accessor.
-  template <bool Available = dimensions == 1>
-  typename std::enable_if<Available, data_t>::type read(float x) {
-    return detail::fetch_data<T>()(_img_acc.read(x, _sampler));
-  }
-  /// Read data from accessor.
-  template <class CoordT,
-            bool Available = dimensions == 1 && std::is_integral<CoordT>::value>
-  typename std::enable_if<Available, data_t>::type read(CoordT x) {
-    return detail::fetch_data<T>()(_img_acc.read(x, _sampler));
-  }
-};
-
-template <class T, int dimensions> class image_accessor_ext<T, dimensions, true> {
-public:
-  using accessor_t =
-      typename detail::image_trait<T>::template array_accessor_t<dimensions>;
-  using data_t = typename detail::image_trait<T>::data_t;
-  sycl::sampler _sampler;
-  accessor_t _img_acc;
-
-public:
-  image_accessor_ext(sycl::sampler sampler, accessor_t acc)
-      : _sampler(sampler), _img_acc(acc) {}
-
-  /// Read data from accessor.
-  template <bool Available = dimensions == 2>
-  typename std::enable_if<Available, data_t>::type read(int index, float x,
-                                                        float y) {
-    return detail::fetch_data<T>()(
-        _img_acc[index].read(sycl::float2(x, y), _sampler));
-  }
-  /// Read data from accessor.
-  template <bool Available = dimensions == 2>
-  typename std::enable_if<Available, data_t>::type read(int index, int x, int y) {
-    return detail::fetch_data<T>()(
-        _img_acc[index].read(sycl::int2(x, y), _sampler));
-  }
-  /// Read data from accessor.
-  template <bool Available = dimensions == 1>
-  typename std::enable_if<Available, data_t>::type read(int index, float x) {
-    return detail::fetch_data<T>()(
-        _img_acc[index].read(x, _sampler));
-  }
-  /// Read data from accessor.
-  template <bool Available = dimensions == 1>
-  typename std::enable_if<Available, data_t>::type read(int index, int x) {
-    return detail::fetch_data<T>()(
-        _img_acc[index].read(x, _sampler));
-  }
-};
-
-/// Create image wrapper according to image data and sampling info.
-/// \return Pointer to image wrapper base class.
-/// \param data Image data used to create image wrapper.
-/// \param info Image sampling info used to create image wrapper.
-/// \returns Pointer to base class of created image wrapper object.
-static inline image_wrapper_base *create_image_wrapper(image_data data,
-                              sampling_info info) {
-  image_channel channel;
-  int dims = 1;
-  if (data.get_data_type() == image_data_type::matrix) {
-    auto matrix = (image_matrix_p)data.get_data_ptr();
-    channel = matrix->get_channel();
-    dims = matrix->get_dims();
-  } else {
-    if (data.get_data_type() == image_data_type::pitch) {
-      dims = 2;
-    }
-    channel = data.get_channel();
-  }
-
-  if (auto ret = detail::create_image_wrapper(channel, dims)) {
-    ret->set_sampling_info(info);
-    ret->set_data(data);
-    return ret;
-  }
-  return nullptr;
-}
-
-namespace detail {
-/// Create image according with given type \p T and \p dims.
-template <class T> static image_wrapper_base *create_image_wrapper(int dims) {
-  switch (dims) {
-  case 1:
-    return new image_wrapper<T, 1>();
-  case 2:
-    return new image_wrapper<T, 2>();
-  case 3:
-    return new image_wrapper<T, 3>();
-  default:
-    return nullptr;
-  }
-}
-/// Create image with given data type \p T, channel order and dims
-template <class T>
-static image_wrapper_base *create_image_wrapper(unsigned channel_num, int dims) {
-  switch (channel_num) {
-  case 1:
-    return create_image_wrapper<T>(dims);
-  case 2:
-    return create_image_wrapper<sycl::vec<T, 2>>(dims);
-  case 3:
-    return create_image_wrapper<sycl::vec<T, 3>>(dims);
-  case 4:
-    return create_image_wrapper<sycl::vec<T, 4>>(dims);
-  default:
-    return nullptr;
-  }
-}
-
-/// Create image with channel info and specified dimensions.
-static image_wrapper_base *create_image_wrapper(image_channel channel, int dims) {
-  switch (channel.get_channel_type()) {
-  case sycl::image_channel_type::fp16:
-    return create_image_wrapper<sycl::half>(channel.get_channel_num(), dims);
-  case sycl::image_channel_type::fp32:
-    return create_image_wrapper<float>(channel.get_channel_num(), dims);
-  case sycl::image_channel_type::signed_int8:
-    return create_image_wrapper<std::int8_t>(channel.get_channel_num(), dims);
-  case sycl::image_channel_type::signed_int16:
-    return create_image_wrapper<std::int16_t>(channel.get_channel_num(), dims);
-  case sycl::image_channel_type::signed_int32:
-    return create_image_wrapper<std::int32_t>(channel.get_channel_num(), dims);
-  case sycl::image_channel_type::unsigned_int8:
-    return create_image_wrapper<std::uint8_t>(channel.get_channel_num(), dims);
-  case sycl::image_channel_type::unsigned_int16:
-    return create_image_wrapper<std::uint16_t>(channel.get_channel_num(), dims);
-  case sycl::image_channel_type::unsigned_int32:
-    return create_image_wrapper<std::uint32_t>(channel.get_channel_num(), dims);
-  default:
-    return nullptr;
-  }
-}
-} // namespace detail
-
-} // namespace dpct
-
-#endif // !__DPCT_IMAGE_HPP__
diff --git a/dpct/kernel.hpp b/dpct/kernel.hpp
deleted file mode 100644
index 11d1321bb4086..0000000000000
--- a/dpct/kernel.hpp
+++ /dev/null
@@ -1,459 +0,0 @@
-//==---- kernel.hpp -------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_KERNEL_HPP__
-#define __DPCT_KERNEL_HPP__
-
-#include <sycl/sycl.hpp>
-#ifdef _WIN32
-#include <unordered_set>
-#include <windows.h>
-#else
-#include <dlfcn.h>
-#endif
-
-#if defined(__has_include) && __has_include(<filesystem>)
-#include <filesystem>
-#elif defined(__has_include) && __has_include(<experimental/filesystem>)
-#include <experimental/filesystem>
-#else
-#error "SYCLomatic runtime requires C++ filesystem support"
-#endif
-
-#include <fstream>
-#include <image.hpp>
-#include <random>
-
-namespace dpct {
-
-typedef void (*kernel_functor)(sycl::queue &, const sycl::nd_range<3> &,
-                               unsigned int, void **, void **);
-
-struct kernel_function_info {
-  int max_work_group_size = 0;
-};
-
-static inline void get_kernel_function_info(kernel_function_info *kernel_info,
-                                            const void *function) {
-  kernel_info->max_work_group_size =
-      dpct::dev_mgr::instance()
-          .current_device()
-          .get_info<sycl::info::device::max_work_group_size>();
-}
-static inline kernel_function_info
-get_kernel_function_info(const void *function) {
-  kernel_function_info kernel_info;
-  kernel_info.max_work_group_size =
-      dpct::dev_mgr::instance()
-          .current_device()
-          .get_info<sycl::info::device::max_work_group_size>();
-  return kernel_info;
-}
-
-
-namespace detail {
-
-#if defined(__has_include) && __has_include(<filesystem>)
-namespace fs = std::filesystem;
-#else
-namespace fs = std::experimental::filesystem;
-#endif
-
-/// Write data to temporary file and return absolute path to temporary file.
-/// Temporary file is created in a temporary directory both of which have random
-/// names with only the user having access permissions.  Only one temporary file
-/// will be created in the temporary directory.
-static inline fs::path write_data_to_file(char const *const data, size_t size) {
-  std::error_code ec;
-
-  if (sizeof(size_t) >= sizeof(std::streamsize) &&
-      size > (std::numeric_limits<std::streamsize>::max)())
-    throw std::runtime_error("data file too large");
-
-  // random number generator
-  std::random_device dev;
-  std::mt19937 prng(dev());
-  std::uniform_int_distribution<uint64_t> rand(0);
-
-  // find temporary directory
-  auto tmp_dir = fs::temp_directory_path(ec);
-  if (ec)
-    throw std::runtime_error("could not find temporary directory");
-
-  // create private directory
-  std::stringstream directory;
-  fs::path directory_path;
-  constexpr int max_attempts = 5;
-  int i;
-
-  for (i = 0; i < max_attempts; i++) {
-    directory << std::hex << rand(prng);
-    directory_path = tmp_dir / directory.str();
-    if (fs::create_directory(directory_path)) {
-      break;
-    }
-  }
-  if (i == max_attempts)
-    throw std::runtime_error("could not create directory");
-
-  // only allow owner permissions to private directory
-  fs::permissions(directory_path, fs::perms::owner_all, ec);
-  if (ec)
-    throw std::runtime_error("could not set directory permissions");
-
-  // random filename in private directory
-  std::stringstream filename;
-  filename << std::hex << rand(prng);
-#ifdef _WIN32
-  auto filepath = directory_path / (filename.str() + ".dll");
-#else
-  auto filepath = directory_path / filename.str();
-#endif
-
-  // write data to temporary file
-  auto outfile = std::ofstream(filepath, std::ios::out | std::ios::binary);
-  if (outfile) {
-    // only allow program to write file
-    fs::permissions(filepath, fs::perms::owner_write, ec);
-    if (ec)
-      throw std::runtime_error("could not set permissions");
-
-    outfile.write(data, size);
-    if (!outfile.good())
-      throw std::runtime_error("could not write data");
-    outfile.close();
-
-    // only allow program to read/execute file
-    fs::permissions(filepath, fs::perms::owner_read | fs::perms::owner_exec,
-                    ec);
-    if (ec)
-      throw std::runtime_error("could not set permissions");
-  } else
-    throw std::runtime_error("could not write data");
-
-  // check temporary file contents
-  auto infile = std::ifstream(filepath, std::ios::in | std::ios::binary);
-  if (infile) {
-    bool mismatch = false;
-    size_t cnt = 0;
-
-    while (1) {
-      char c;
-      infile.get(c);
-      if (infile.eof())
-        break;
-      if (c != data[cnt++])
-        mismatch = true;
-    }
-    if (cnt != size || mismatch)
-      throw std::runtime_error("file contents not written correctly");
-  } else
-    throw std::runtime_error("could not validate file");
-
-  if (!filepath.is_absolute())
-    throw std::runtime_error("temporary filepath is not absolute");
-
-  return filepath;
-}
-
-static inline uint16_t extract16(unsigned char const *const ptr) {
-  uint16_t ret = 0;
-
-  ret |= static_cast<uint16_t>(ptr[0]) << 0;
-  ret |= static_cast<uint16_t>(ptr[1]) << 8;
-
-  return (ret);
-}
-
-static inline uint32_t extract32(unsigned char const *const ptr) {
-  uint32_t ret = 0;
-
-  ret |= static_cast<uint32_t>(ptr[0]) << 0;
-  ret |= static_cast<uint32_t>(ptr[1]) << 8;
-  ret |= static_cast<uint32_t>(ptr[2]) << 16;
-  ret |= static_cast<uint32_t>(ptr[3]) << 24;
-
-  return (ret);
-}
-
-static inline uint64_t extract64(unsigned char const *const ptr) {
-  uint64_t ret = 0;
-
-  ret |= static_cast<uint64_t>(ptr[0]) << 0;
-  ret |= static_cast<uint64_t>(ptr[1]) << 8;
-  ret |= static_cast<uint64_t>(ptr[2]) << 16;
-  ret |= static_cast<uint64_t>(ptr[3]) << 24;
-  ret |= static_cast<uint64_t>(ptr[4]) << 32;
-  ret |= static_cast<uint64_t>(ptr[5]) << 40;
-  ret |= static_cast<uint64_t>(ptr[6]) << 48;
-  ret |= static_cast<uint64_t>(ptr[7]) << 56;
-
-  return (ret);
-}
-
-static inline uint64_t get_lib_size(char const *const blob) {
-#ifdef _WIN32
-  ///////////////////////////////////////////////////////////////////////
-  // Analyze DOS stub
-  unsigned char const *const ublob =
-      reinterpret_cast<unsigned char const *const>(blob);
-  if (ublob[0] != 0x4d || ublob[1] != 0x5a) {
-    throw std::runtime_error("Blob is not a Windows DLL.");
-  }
-  uint32_t pe_header_offset = extract32(ublob + 0x3c);
-
-  ///////////////////////////////////////////////////////////////////////
-  // Ananlyze PE-header
-  unsigned char const *const pe_header = ublob + pe_header_offset;
-
-  // signature
-  uint32_t pe_signature = extract32(pe_header + 0);
-  if (pe_signature != 0x00004550) {
-    throw std::runtime_error("PE-header signature is not 0x00004550");
-  }
-
-  // machine
-  uint16_t machine = extract16(pe_header + 4);
-  if (machine != 0x8664) {
-    throw std::runtime_error("Only DLLs for x64 supported");
-  }
-
-  // number of sections
-  uint16_t number_of_sections = extract16(pe_header + 6);
-
-  // sizeof optional header
-  uint16_t sizeof_optional_header = extract16(pe_header + 20);
-
-  // magic
-  uint16_t magic = extract16(pe_header + 24);
-  if (magic != 0x10b && magic != 0x20b) {
-    throw std::runtime_error("MAGIC is not 0x010b or 0x020b");
-  }
-
-  ///////////////////////////////////////////////////////////////////////
-  // Analyze tail of optional header
-  constexpr int coff_header_size = 24;
-
-  unsigned char const *const tail_of_optional_header =
-      pe_header + coff_header_size + sizeof_optional_header;
-  if (extract64(tail_of_optional_header - 8) != 0) {
-    throw std::runtime_error("Optional header not zero-padded");
-  }
-
-  ///////////////////////////////////////////////////////////////////////
-  // Analyze last section header
-  constexpr int section_header_size = 40;
-  unsigned char const *const last_section_header =
-      tail_of_optional_header + section_header_size * (number_of_sections - 1);
-
-  uint32_t sizeof_raw_data = extract32(last_section_header + 16);
-  uint32_t pointer_to_raw_data = extract32(last_section_header + 20);
-
-  return sizeof_raw_data + pointer_to_raw_data;
-#else
-  if (blob[0] != 0x7F || blob[1] != 'E' || blob[2] != 'L' || blob[3] != 'F')
-    throw std::runtime_error("Blob is not in ELF format");
-
-  if (blob[4] != 0x02)
-    throw std::runtime_error("Only 64-bit headers are supported");
-
-  if (blob[5] != 0x01)
-    throw std::runtime_error("Only little-endian headers are supported");
-
-  unsigned char const *const ublob =
-      reinterpret_cast<unsigned char const *const>(blob);
-  uint64_t e_shoff = extract64(ublob + 0x28);
-  uint16_t e_shentsize = extract16(ublob + 0x3A);
-  uint16_t e_shnum = extract16(ublob + 0x3C);
-
-  return e_shoff + (e_shentsize * e_shnum);
-#endif
-}
-
-#ifdef _WIN32
-class path_lib_record {
-public:
-  void operator=(const path_lib_record &) = delete;
-  ~path_lib_record() {
-    for (auto entry : lib_to_path) {
-      FreeLibrary(static_cast<HMODULE>(entry.first));
-      fs::permissions(entry.second, fs::perms::owner_all);
-      fs::remove_all(entry.second.remove_filename());
-    }
-  }
-  static void record_lib_path(fs::path path, void *library) {
-    lib_to_path[library] = path;
-  }
-  static void remove_lib(void *library) {
-    auto path = lib_to_path[library];
-    std::error_code ec;
-
-    FreeLibrary(static_cast<HMODULE>(library));
-    fs::permissions(path, fs::perms::owner_all);
-    if (fs::remove_all(path.remove_filename(), ec) != 2 || ec)
-      // one directory and one temporary file should have been deleted
-      throw std::runtime_error("Directory delete failed");
-
-    lib_to_path.erase(library);
-  }
-
-private:
-  static inline std::unordered_map<void *, fs::path> lib_to_path;
-};
-#endif
-
-} // namespace detail
-
-class kernel_library {
-public:
-  kernel_library() : ptr{nullptr} {}
-  kernel_library(void *ptr) : ptr{ptr} {}
-
-  operator void *() const { return ptr; }
-
-private:
-  void *ptr;
-#ifdef _WIN32
-  static inline detail::path_lib_record single_instance_to_trigger_destructor;
-#endif
-};
-
-namespace detail {
-
-static inline kernel_library load_dl_from_data(char const *const data,
-                                               size_t size) {
-  fs::path filename = write_data_to_file(data, size);
-#ifdef _WIN32
-  void *so = LoadLibraryW(filename.wstring().c_str());
-#else
-  void *so = dlopen(filename.c_str(), RTLD_LAZY);
-#endif
-  if (so == nullptr)
-    throw std::runtime_error("Failed to load kernel library");
-
-#ifdef _WIN32
-  detail::path_lib_record::record_lib_path(filename, so);
-#else
-  std::error_code ec;
-
-  // Windows DLL cannot be deleted while in use
-  if (fs::remove_all(filename.remove_filename(), ec) != 2 || ec)
-    // one directory and one temporary file should have been deleted
-    throw std::runtime_error("Directory delete failed");
-#endif
-
-  return so;
-}
-
-} // namespace detail
-
-/// Load kernel library and return a handle to use the library.
-/// \param [in] name The name of the library.
-static inline kernel_library load_kernel_library(const std::string &name) {
-  std::ifstream ifs;
-  ifs.open(name, std::ios::in | std::ios::binary);
-
-  std::stringstream buffer;
-  buffer << ifs.rdbuf();
-
-  const std::string buffer_string = buffer.str();
-  return detail::load_dl_from_data(buffer_string.c_str(), buffer_string.size());
-}
-
-/// Load kernel library whose image is alreay in memory and return a handle to
-/// use the library.
-/// \param [in] image A pointer to the image in memory.
-static inline kernel_library load_kernel_library_mem(char const *const image) {
-  const size_t size = detail::get_lib_size(image);
-
-  return detail::load_dl_from_data(image, size);
-}
-
-/// Unload kernel library.
-/// \param [in,out] library Handle to the library to be closed.
-static inline void unload_kernel_library(const kernel_library &library) {
-#ifdef _WIN32
-  detail::path_lib_record::remove_lib(library);
-#else
-  dlclose(library);
-#endif
-}
-
-class kernel_function {
-public:
-  kernel_function() : ptr{nullptr} {}
-  kernel_function(dpct::kernel_functor ptr) : ptr{ptr} {}
-
-  operator void *() const { return ((void *)ptr); }
-
-  void operator()(sycl::queue &q, const sycl::nd_range<3> &range,
-                  unsigned int a, void **args, void **extra) {
-    ptr(q, range, a, args, extra);
-  }
-
-private:
-  dpct::kernel_functor ptr;
-};
-
-/// Find kernel function in a kernel library and return its address.
-/// \param [in] library Handle to the kernel library.
-/// \param [in] name Name of the kernel function.
-static inline dpct::kernel_function
-get_kernel_function(kernel_library &library, const std::string &name) {
-#ifdef _WIN32
-  dpct::kernel_functor fn = reinterpret_cast<dpct::kernel_functor>(
-      GetProcAddress(static_cast<HMODULE>(static_cast<void *>(library)),
-                     (name + std::string("_wrapper")).c_str()));
-#else
-  dpct::kernel_functor fn = reinterpret_cast<dpct::kernel_functor>(
-      dlsym(library, (name + std::string("_wrapper")).c_str()));
-#endif
-  if (fn == nullptr)
-    throw std::runtime_error("Failed to get function");
-  return fn;
-}
-
-/// Invoke a kernel function.
-/// \param [in] function kernel function.
-/// \param [in] queue SYCL queue used to execute kernel
-/// \param [in] groupRange SYCL group range
-/// \param [in] localRange SYCL local range
-/// \param [in] localMemSize The size of local memory required by the kernel
-///             function.
-/// \param [in] kernelParams Array of pointers to kernel arguments.
-/// \param [in] extra Extra arguments.
-static inline void invoke_kernel_function(dpct::kernel_function &function,
-                                          sycl::queue &queue,
-                                          sycl::range<3> groupRange,
-                                          sycl::range<3> localRange,
-                                          unsigned int localMemSize,
-                                          void **kernelParams, void **extra) {
-  function(queue, sycl::nd_range<3>(groupRange * localRange, localRange),
-           localMemSize, kernelParams, extra);
-}
-
-/// Find image wrapper in a kernel library and return its address.
-/// \param [in] library Handle to the kernel library.
-/// \param [in] name Name of the target image wrapper.
-static inline dpct::image_wrapper_base_p
-get_image_wrapper(dpct::kernel_library &library, const std::string &name) {
-#ifdef _WIN32
-  dpct::image_wrapper_base_p fn =
-      reinterpret_cast<dpct::image_wrapper_base_p>(GetProcAddress(
-          static_cast<HMODULE>(static_cast<void *>(library)), name.c_str()));
-#else
-  dpct::image_wrapper_base_p fn = reinterpret_cast<dpct::image_wrapper_base_p>(
-      dlsym(library, name.c_str()));
-#endif
-  if (fn == nullptr)
-    throw std::runtime_error("Failed to get image");
-  return fn;
-}
-
-} // namespace dpct
-#endif // __DPCT_KERNEL_HPP__
diff --git a/dpct/lapack_utils.hpp b/dpct/lapack_utils.hpp
deleted file mode 100644
index dac77d5773ec4..0000000000000
--- a/dpct/lapack_utils.hpp
+++ /dev/null
@@ -1,1953 +0,0 @@
-//==---- lapack_utils.hpp -------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_LAPACK_UTILS_HPP__
-#define __DPCT_LAPACK_UTILS_HPP__
-
-#include "memory.hpp"
-#include "util.hpp"
-#include "lib_common_utils.hpp"
-
-#include <oneapi/mkl.hpp>
-#include <sycl/sycl.hpp>
-
-namespace dpct {
-namespace lapack {
-/// Computes all eigenvalues and, optionally, eigenvectors of a real generalized
-/// symmetric definite eigenproblem using a divide and conquer method.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] queue Device queue where calculations will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] itype Must be 1 or 2 or 3. Specifies the problem type to be solved.
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrices A and B.
-/// \param [in,out] a The symmetric matrix A.
-/// \param [in] lda The leading dimension of matrix A.
-/// \param [in,out] b The symmetric matrix B.
-/// \param [in] ldb The leading dimension of matrix B.
-/// \param [out] w Eigenvalues.
-/// \param [in] scratchpad Scratchpad memory to be used by the routine
-/// for storing intermediate results.
-/// \param [in] scratchpad_size Size of scratchpad memory as a number of
-/// floating point elements of type T.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-template <typename T>
-inline int sygvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                 oneapi::mkl::uplo uplo, int n, T *a, int lda, T *b, int ldb,
-                 T *w, T *scratchpad, int scratchpad_size, int *info) {
-#ifdef DPCT_USM_LEVEL_NONE
-  auto info_buf = get_buffer<int>(info);
-  auto a_buffer = get_buffer<T>(a);
-  auto b_buffer = get_buffer<T>(b);
-  auto w_buffer = get_buffer<T>(w);
-  auto scratchpad_buffer = get_buffer<T>(scratchpad);
-  int info_val = 0;
-  int ret_val = 0;
-  try {
-    oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a_buffer, lda,
-                               b_buffer, ldb, w_buffer, scratchpad_buffer,
-                               scratchpad_size);
-  } catch (oneapi::mkl::lapack::exception const& e) {
-    std::cerr << "Unexpected exception caught during call to LAPACK API: sygvd"
-              << std::endl
-              << "reason: " << e.what() << std::endl
-              << "info: " << e.info() << std::endl;
-    info_val = static_cast<int>(e.info());
-    ret_val = 1;
-  } catch (sycl::exception const& e) {
-    std::cerr << "Caught synchronous SYCL exception:" << std::endl
-              << "reason: " << e.what() << std::endl;
-    ret_val = 1;
-  }
-  queue.submit([&, info_val](sycl::handler &cgh) {
-    auto info_acc = info_buf.get_access<sycl::access_mode::write>(cgh);
-    cgh.single_task<dpct_kernel_name<class sygvd_set_info, T>>(
-        [=]() { info_acc[0] = info_val; });
-  });
-  return ret_val;
-#else
-  try {
-    oneapi::mkl::lapack::sygvd(queue, itype, jobz, uplo, n, a, lda, b, ldb, w,
-                               scratchpad, scratchpad_size);
-  } catch (oneapi::mkl::lapack::exception const& e) {
-    std::cerr << "Unexpected exception caught during call to LAPACK API: sygvd"
-              << std::endl
-              << "reason: " << e.what() << std::endl
-              << "info: " << e.info() << std::endl;
-    int info_val = static_cast<int>(e.info());
-    queue.memcpy(info, &info_val, sizeof(int)).wait();
-    return 1;
-  } catch (sycl::exception const& e) {
-    std::cerr << "Caught synchronous SYCL exception:" << std::endl
-              << "reason: " << e.what() << std::endl;
-    queue.memset(info, 0, sizeof(int)).wait();
-    return 1;
-  }
-  queue.memset(info, 0, sizeof(int));
-  return 0;
-#endif
-}
-/// Computes all the eigenvalues, and optionally, the eigenvectors of a complex
-/// generalized Hermitian positive-definite eigenproblem using a divide and
-/// conquer method.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] queue Device queue where calculations will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] itype Must be 1 or 2 or 3. Specifies the problem type to be solved.
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrices A and B.
-/// \param [in,out] a The Hermitian matrix A.
-/// \param [in] lda The leading dimension of matrix A.
-/// \param [in,out] b The Hermitian matrix B.
-/// \param [in] ldb The leading dimension of matrix B.
-/// \param [in] w Eigenvalues.
-/// \param [in] scratchpad Scratchpad memory to be used by the routine
-/// for storing intermediate results.
-/// \param [in] scratchpad_size Size of scratchpad memory as a number of
-/// floating point elements of type T.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-template <typename T, typename Tw>
-inline int hegvd(sycl::queue &queue, std::int64_t itype, oneapi::mkl::job jobz,
-                 oneapi::mkl::uplo uplo, int n, T *a, int lda, T *b, int ldb,
-                 Tw *w, T *scratchpad, int scratchpad_size, int *info) {
-  using Ty = typename DataType<T>::T2;
-#ifdef DPCT_USM_LEVEL_NONE
-  auto info_buf = get_buffer<int>(info);
-  auto a_buffer = get_buffer<Ty>(a);
-  auto b_buffer = get_buffer<Ty>(b);
-  auto w_buffer = get_buffer<Tw>(w);
-  auto scratchpad_buffer = get_buffer<Ty>(scratchpad);
-  int info_val = 0;
-  int ret_val = 0;
-  try {
-    oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, a_buffer, lda,
-                               b_buffer, ldb, w_buffer, scratchpad_buffer,
-                               scratchpad_size);
-  } catch (oneapi::mkl::lapack::exception const& e) {
-    std::cerr << "Unexpected exception caught during call to LAPACK API: hegvd"
-              << std::endl
-              << "reason: " << e.what() << std::endl
-              << "info: " << e.info() << std::endl;
-    info_val = static_cast<int>(e.info());
-    ret_val = 1;
-  } catch (sycl::exception const& e) {
-    std::cerr << "Caught synchronous SYCL exception:" << std::endl
-              << "reason: " << e.what() << std::endl;
-    ret_val = 1;
-  }
-  queue.submit([&, info_val](sycl::handler &cgh) {
-    auto info_acc = info_buf.get_access<sycl::access_mode::write>(cgh);
-    cgh.single_task<dpct_kernel_name<class hegvd_set_info, T>>(
-        [=]() { info_acc[0] = info_val; });
-  });
-  return ret_val;
-#else
-  try {
-    oneapi::mkl::lapack::hegvd(queue, itype, jobz, uplo, n, (Ty *)a, lda, (Ty *)b,
-                               ldb, w, (Ty *)scratchpad, scratchpad_size);
-  } catch (oneapi::mkl::lapack::exception const& e) {
-    std::cerr << "Unexpected exception caught during call to LAPACK API: hegvd"
-              << std::endl
-              << "reason: " << e.what() << std::endl
-              << "info: " << e.info() << std::endl;
-    int info_val = static_cast<int>(e.info());
-    queue.memcpy(info, &info_val, sizeof(int)).wait();
-    return 1;
-  } catch (sycl::exception const& e) {
-    std::cerr << "Caught synchronous SYCL exception:" << std::endl
-              << "reason: " << e.what() << std::endl;
-    queue.memset(info, 0, sizeof(int)).wait();
-    return 1;
-  }
-  queue.memset(info, 0, sizeof(int));
-  return 0;
-#endif
-}
-/// Computes the Cholesky factorizations of a batch of symmetric (or Hermitian,
-/// for complex data) positive-definite matrices.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] queue Device queue where calculations will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in,out] a Array of pointers to matrix A.
-/// \param [in] lda The leading dimension of matrix A.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-/// \param [in] group_size The batch size.
-template <typename T>
-inline int potrf_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, int n,
-                       T *a[], int lda, int *info, int group_size) {
-#ifdef DPCT_USM_LEVEL_NONE
-  throw std::runtime_error("this API is unsupported when USM level is none");
-#else
-  using Ty = typename DataType<T>::T2;
-  struct matrix_info_t {
-    oneapi::mkl::uplo uplo_info;
-    std::int64_t n_info;
-    std::int64_t lda_info;
-    std::int64_t group_size_info;
-  };
-  matrix_info_t *matrix_info =
-      (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
-  matrix_info->uplo_info = uplo;
-  matrix_info->n_info = n;
-  matrix_info->lda_info = lda;
-  matrix_info->group_size_info = group_size;
-  std::int64_t scratchpad_size = 0;
-  sycl::event e;
-  Ty *scratchpad = nullptr;
-  try {
-    scratchpad_size = oneapi::mkl::lapack::potrf_batch_scratchpad_size<Ty>(
-        queue, &(matrix_info->uplo_info), &(matrix_info->n_info),
-        &(matrix_info->lda_info), 1, &(matrix_info->group_size_info));
-    scratchpad = sycl::malloc_device<Ty>(scratchpad_size, queue);
-    e = oneapi::mkl::lapack::potrf_batch(
-        queue, &(matrix_info->uplo_info), &(matrix_info->n_info), (Ty **)a,
-        &(matrix_info->lda_info), 1, &(matrix_info->group_size_info),
-        scratchpad, scratchpad_size);
-  } catch (oneapi::mkl::lapack::batch_error const &be) {
-    std::cerr << "Unexpected exception caught during call to LAPACK API: "
-                 "potrf_batch_scratchpad_size/potrf_batch"
-              << std::endl
-              << "reason: " << be.what() << std::endl
-              << "number: " << be.info() << std::endl;
-    int i = 0;
-    auto &ids = be.ids();
-    std::vector<int> info_vec(group_size);
-    for (auto const &e : be.exceptions()) {
-      try {
-        std::rethrow_exception(e);
-      } catch (oneapi::mkl::lapack::exception &e) {
-        std::cerr << "Exception " << ids[i] << std::endl
-                  << "reason: " << e.what() << std::endl
-                  << "info: " << e.info() << std::endl;
-        info_vec[i] = e.info();
-        i++;
-      }
-    }
-    queue.memcpy(info, info_vec.data(), group_size * sizeof(int)).wait();
-    std::free(matrix_info);
-    if (scratchpad)
-      sycl::free(scratchpad, queue);
-    return 1;
-  } catch (sycl::exception const &e) {
-    std::cerr << "Caught synchronous SYCL exception:" << std::endl
-              << "reason: " << e.what() << std::endl;
-    queue.memset(info, 0, group_size * sizeof(int)).wait();
-    std::free(matrix_info);
-    if (scratchpad)
-      sycl::free(scratchpad, queue);
-    return 1;
-  }
-  queue.submit([&](sycl::handler &cgh) {
-    cgh.depends_on(e);
-    cgh.host_task([=] {
-      std::free(matrix_info);
-      sycl::free(scratchpad, queue);
-    });
-  });
-  queue.memset(info, 0, group_size * sizeof(int));
-  return 0;
-#endif
-}
-/// Solves a batch of systems of linear equations with a Cholesky-factored
-/// symmetric (Hermitian) positive-definite coefficient matrices.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] queue Device queue where calculations will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in] nrhs The number of right-hand sides.
-/// \param [in,out] a Array of pointers to matrix A.
-/// \param [in] lda The leading dimension of matrix A.
-/// \param [in,out] b Array of pointers to matrix B.
-/// \param [in] ldb The leading dimension of matrix B.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-/// \param [in] group_size The batch size.
-template <typename T>
-inline int potrs_batch(sycl::queue &queue, oneapi::mkl::uplo uplo, int n,
-                       int nrhs, T *a[], int lda, T *b[], int ldb, int *info,
-                       int group_size) {
-#ifdef DPCT_USM_LEVEL_NONE
-  throw std::runtime_error("this API is unsupported when USM level is none");
-#else
-  using Ty = typename DataType<T>::T2;
-  struct matrix_info_t {
-    oneapi::mkl::uplo uplo_info;
-    std::int64_t n_info;
-    std::int64_t nrhs_info;
-    std::int64_t lda_info;
-    std::int64_t ldb_info;
-    std::int64_t group_size_info;
-  };
-  matrix_info_t *matrix_info =
-      (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
-  matrix_info->uplo_info = uplo;
-  matrix_info->n_info = n;
-  matrix_info->nrhs_info = nrhs;
-  matrix_info->lda_info = lda;
-  matrix_info->ldb_info = ldb;
-  matrix_info->group_size_info = group_size;
-  std::int64_t scratchpad_size = 0;
-  sycl::event e;
-  Ty *scratchpad = nullptr;
-  try {
-    scratchpad_size = oneapi::mkl::lapack::potrs_batch_scratchpad_size<Ty>(
-        queue, &(matrix_info->uplo_info), &(matrix_info->n_info),
-        &(matrix_info->nrhs_info), &(matrix_info->lda_info),
-        &(matrix_info->ldb_info), 1, &(matrix_info->group_size_info));
-    scratchpad = sycl::malloc_device<Ty>(scratchpad_size, queue);
-    e = oneapi::mkl::lapack::potrs_batch(
-        queue, &(matrix_info->uplo_info), &(matrix_info->n_info),
-        &(matrix_info->nrhs_info), (Ty **)a, &(matrix_info->lda_info), (Ty **)b,
-        &(matrix_info->ldb_info), 1, &(matrix_info->group_size_info),
-        scratchpad, scratchpad_size);
-  } catch (oneapi::mkl::lapack::batch_error const &be) {
-    std::cerr << "Unexpected exception caught during call to LAPACK API: "
-                 "potrs_batch_scratchpad_size/potrs_batch"
-              << std::endl
-              << "reason: " << be.what() << std::endl
-              << "number: " << be.info() << std::endl;
-    int i = 0;
-    auto &ids = be.ids();
-    std::vector<int> info_vec(group_size);
-    for (auto const &e : be.exceptions()) {
-      try {
-        std::rethrow_exception(e);
-      } catch (oneapi::mkl::lapack::exception &e) {
-        std::cerr << "Exception " << ids[i] << std::endl
-                  << "reason: " << e.what() << std::endl
-                  << "info: " << e.info() << std::endl;
-        info_vec[i] = e.info();
-        i++;
-      }
-    }
-    queue.memcpy(info, info_vec.data(), group_size * sizeof(int)).wait();
-    std::free(matrix_info);
-    if (scratchpad)
-      sycl::free(scratchpad, queue);
-    return 1;
-  } catch (sycl::exception const &e) {
-    std::cerr << "Caught synchronous SYCL exception:" << std::endl
-              << "reason: " << e.what() << std::endl;
-    queue.memset(info, 0, group_size * sizeof(int)).wait();
-    std::free(matrix_info);
-    if (scratchpad)
-      sycl::free(scratchpad, queue);
-    return 1;
-  }
-  queue.submit([&](sycl::handler &cgh) {
-    cgh.depends_on(e);
-    cgh.host_task([=] {
-      std::free(matrix_info);
-      sycl::free(scratchpad, queue);
-    });
-  });
-  queue.memset(info, 0, group_size * sizeof(int));
-  return 0;
-#endif
-}
-
-namespace detail {
-template <template <typename> typename functor_t, typename... args_t>
-inline int lapack_shim(sycl::queue &q, library_data_t a_type, int *info,
-                       std::string const &lapack_api_name, args_t &&...args) {
-  auto handle_lapack_exception = [&](const oneapi::mkl::lapack::exception &e) {
-    std::cerr << "Unexpected exception caught during call to LAPACK API: "
-              << lapack_api_name << std::endl
-              << "reason: " << e.what() << std::endl
-              << "info: " << e.info() << std::endl
-              << "detail: " << e.detail() << std::endl;
-    if (e.info() < std::numeric_limits<int>::min() ||
-        e.info() > std::numeric_limits<int>::max()) {
-      throw std::runtime_error("e.info() exceeds the limit of int type");
-    }
-    int info_val = static_cast<int>(e.info());
-    if (info)
-      dpct::detail::dpct_memcpy(q, info, &info_val, sizeof(int),
-                                memcpy_direction::host_to_device)
-          .wait();
-    return 1;
-  };
-  try {
-    switch (a_type) {
-    case library_data_t::real_float: {
-      functor_t<float>()(std::forward<args_t>(args)...);
-      break;
-    }
-    case library_data_t::real_double: {
-      functor_t<double>()(std::forward<args_t>(args)...);
-      break;
-    }
-    case library_data_t::complex_float: {
-      functor_t<std::complex<float>>()(std::forward<args_t>(args)...);
-      break;
-    }
-    case library_data_t::complex_double: {
-      functor_t<std::complex<double>>()(std::forward<args_t>(args)...);
-      break;
-    }
-    default:
-      throw std::runtime_error("the data type is unsupported");
-    }
-  } catch (oneapi::mkl::lapack::batch_error const &be) {
-    try {
-      std::rethrow_exception(be.exceptions()[0]);
-    } catch (oneapi::mkl::lapack::exception &e) {
-      return handle_lapack_exception(e);
-    }
-  } catch (oneapi::mkl::lapack::exception const &e) {
-    return handle_lapack_exception(e);
-  } catch (sycl::exception const &e) {
-    std::cerr << "Caught synchronous SYCL exception:" << std::endl
-              << "reason: " << e.what() << std::endl;
-    if (info)
-      dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int)).wait();
-    return 1;
-  }
-  return 0;
-}
-
-template <typename T> class working_memory {
-public:
-  working_memory(std::size_t element_number, const sycl::queue &q) : _q(q) {
-    _ptr = dpct::detail::dpct_malloc(element_number * sizeof(T), _q);
-  }
-  auto get_memory() {
-    return dpct::detail::get_memory<T>(_ptr);
-  }
-  auto get_ptr() {
-    return _ptr;
-  }
-  void set_event(sycl::event e) { _e = e; }
-  ~working_memory() {
-    if (_ptr) {
-      dpct::async_dpct_free({_ptr}, {_e}, _q);
-    }
-  }
-
-private:
-  void *_ptr = nullptr;
-  sycl::event _e;
-  sycl::queue _q;
-};
-
-std::size_t byte_to_element_number(std::size_t size_in_byte,
-                                   dpct::library_data_t element_type) {
-  auto dv = std::lldiv(
-      size_in_byte,
-      dpct::detail::library_data_size[static_cast<unsigned int>(element_type)] /
-          8);
-  if (dv.rem) {
-    throw std::runtime_error(
-        "size_in_byte is not divisible by the size of element (in bytes)");
-  }
-  return dv.quot;
-}
-std::size_t element_number_to_byte(std::size_t size_in_element,
-                                   dpct::library_data_t element_type) {
-  auto dv = std::lldiv(
-      dpct::detail::library_data_size[static_cast<unsigned int>(element_type)],
-      8);
-  if (dv.rem) {
-    throw std::runtime_error(
-        "the size of element (in bits) is not divisible by 8");
-  }
-  return size_in_element * dv.quot;
-}
-
-inline oneapi::mkl::jobsvd char2jobsvd(signed char job) {
-  switch (job) {
-  case 'A':
-    return oneapi::mkl::jobsvd::vectors;
-  case 'S':
-    return oneapi::mkl::jobsvd::somevec;
-  case 'O':
-    return oneapi::mkl::jobsvd::vectorsina;
-  case 'N':
-    return oneapi::mkl::jobsvd::novec;
-  default:
-    throw std::runtime_error("the job type is unsupported");
-  }
-}
-
-template <typename T> struct getrf_scratchpad_size_impl {
-  void operator()(sycl::queue &q, std::int64_t m, std::int64_t n,
-                  library_data_t a_type, std::int64_t lda,
-                  std::size_t &device_ws_size) {
-    device_ws_size =
-        oneapi::mkl::lapack::getrf_scratchpad_size<T>(q, m, n, lda);
-  }
-};
-
-template <typename T> struct getrf_impl {
-  void operator()(sycl::queue &q, std::int64_t m, std::int64_t n,
-                  library_data_t a_type, void *a, std::int64_t lda,
-                  std::int64_t *ipiv, void *device_ws,
-                  std::size_t device_ws_size, int *info) {
-    auto ipiv_data = dpct::detail::get_memory<std::int64_t>(ipiv);
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
-    oneapi::mkl::lapack::getrf(q, m, n, a_data, lda, ipiv_data, device_ws_data,
-                               device_ws_size);
-    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-  }
-};
-
-template <typename T> struct getrs_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::transpose trans, std::int64_t n,
-                  std::int64_t nrhs, library_data_t a_type, void *a,
-                  std::int64_t lda, std::int64_t *ipiv, library_data_t b_type,
-                  void *b, std::int64_t ldb, int *info) {
-    auto ipiv_data = dpct::detail::get_memory<std::int64_t>(ipiv);
-    std::int64_t device_ws_size = oneapi::mkl::lapack::getrs_scratchpad_size<T>(
-        q, trans, n, nrhs, lda, ldb);
-    working_memory<T> device_ws(device_ws_size, q);
-    auto device_ws_data = device_ws.get_memory();
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto b_data = dpct::detail::get_memory<T>(b);
-    oneapi::mkl::lapack::getrs(q, trans, n, nrhs, a_data, lda, ipiv_data,
-                               b_data, ldb, device_ws_data, device_ws_size);
-    sycl::event e = dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-    device_ws.set_event(e);
-  }
-};
-
-template <typename T> struct geqrf_scratchpad_size_impl {
-  void operator()(sycl::queue &q, std::int64_t m, std::int64_t n,
-                  library_data_t a_type, std::int64_t lda,
-                  std::size_t &device_ws_size) {
-    device_ws_size =
-        oneapi::mkl::lapack::geqrf_scratchpad_size<T>(q, m, n, lda);
-  }
-};
-
-template <typename T> struct geqrf_impl {
-  void operator()(sycl::queue &q, std::int64_t m, std::int64_t n,
-                  library_data_t a_type, void *a, std::int64_t lda,
-                  library_data_t tau_type, void *tau, void *device_ws,
-                  std::size_t device_ws_size, int *info) {
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto tau_data = dpct::detail::get_memory<T>(tau);
-    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
-    oneapi::mkl::lapack::geqrf(q, m, n, a_data, lda, tau_data, device_ws_data,
-                               device_ws_size);
-    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-  }
-};
-
-template <typename T> struct getrfnp_impl {
-  void operator()(sycl::queue &q, std::int64_t m, std::int64_t n,
-                  library_data_t a_type, void *a, std::int64_t lda,
-                  std::int64_t *ipiv, void *device_ws,
-                  std::size_t device_ws_size, int *info) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error(
-        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
-        "Project does not support this API.");
-#else
-    std::int64_t a_stride = m * lda;
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
-    oneapi::mkl::lapack::getrfnp_batch(q, m, n, a_data, lda, a_stride, 1,
-                                       device_ws_data, device_ws_size);
-    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-#endif
-  }
-};
-
-template <typename T> struct gesvd_scratchpad_size_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::jobsvd jobu,
-                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                  library_data_t a_type, std::int64_t lda,
-                  library_data_t u_type, std::int64_t ldu,
-                  library_data_t vt_type, std::int64_t ldvt,
-                  std::size_t &device_ws_size) {
-    device_ws_size = oneapi::mkl::lapack::gesvd_scratchpad_size<T>(
-        q, jobu, jobvt, m, n, lda, ldu, ldvt);
-  }
-};
-
-template <typename T> struct ElementType {
-  using value_tpye = T;
-};
-template <typename T> struct ElementType<std::complex<T>> {
-  using value_tpye = T;
-};
-template <typename T> struct gesvd_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::jobsvd jobu,
-                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                  library_data_t a_type, void *a, std::int64_t lda,
-                  library_data_t s_type, void *s, library_data_t u_type,
-                  void *u, std::int64_t ldu, library_data_t vt_type, void *vt,
-                  std::int64_t ldvt, void *device_ws,
-                  std::size_t device_ws_size, int *info) {
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto s_data =
-        dpct::detail::get_memory<typename ElementType<T>::value_tpye>(s);
-    auto u_data = dpct::detail::get_memory<T>(u);
-    auto vt_data = dpct::detail::get_memory<T>(vt);
-    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
-    oneapi::mkl::lapack::gesvd(q, jobu, jobvt, m, n, a_data, lda, s_data,
-                               u_data, ldu, vt_data, ldvt, device_ws_data,
-                               device_ws_size);
-    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-  }
-};
-template <typename T> struct gesvd_conj_impl : public gesvd_impl<T> {
-  void operator()(sycl::queue &q, oneapi::mkl::jobsvd jobu,
-                  oneapi::mkl::jobsvd jobvt, std::int64_t m, std::int64_t n,
-                  library_data_t a_type, void *a, std::int64_t lda,
-                  library_data_t s_type, void *s, library_data_t u_type,
-                  void *u, std::int64_t ldu, library_data_t vt_type, void *vt,
-                  std::int64_t ldvt, void *device_ws,
-                  std::size_t device_ws_size, int *info) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error(
-        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
-        "Project does not support this API.");
-#else
-    using base = gesvd_impl<T>;
-    base::operator()(q, jobu, jobvt, m, n, a_type, a, lda, s_type, s, u_type, u,
-                     ldu, vt_type, vt, ldvt, device_ws, device_ws_size, info);
-    auto vt_data = dpct::detail::get_memory<T>(vt);
-    oneapi::mkl::blas::row_major::imatcopy(q, oneapi::mkl::transpose::conjtrans,
-                                           n, n, T(1.0f), vt_data, ldvt, ldvt);
-    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-#endif
-  }
-};
-
-template <typename T> struct potrf_scratchpad_size_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::uplo uplo, std::int64_t n,
-                  library_data_t a_type, std::int64_t lda,
-                  std::size_t &device_ws_size) {
-    device_ws_size =
-        oneapi::mkl::lapack::potrf_scratchpad_size<T>(q, uplo, n, lda);
-  }
-};
-
-template <typename T> struct potrf_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::uplo uplo, std::int64_t n,
-                  library_data_t a_type, void *a, std::int64_t lda,
-                  void *device_ws, std::size_t device_ws_size, int *info) {
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
-    oneapi::mkl::lapack::potrf(q, uplo, n, a_data, lda, device_ws_data,
-                               device_ws_size);
-    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-  }
-};
-
-template <typename T> struct potrs_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::uplo uplo, std::int64_t n,
-                  std::int64_t nrhs, library_data_t a_type, void *a,
-                  std::int64_t lda, library_data_t b_type, void *b,
-                  std::int64_t ldb, int *info) {
-    std::int64_t device_ws_size = oneapi::mkl::lapack::potrs_scratchpad_size<T>(
-        q, uplo, n, nrhs, lda, ldb);
-    working_memory<T> device_ws(device_ws_size, q);
-    auto device_ws_data = device_ws.get_memory();
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto b_data = dpct::detail::get_memory<T>(b);
-    oneapi::mkl::lapack::potrs(q, uplo, n, nrhs, a_data, lda, b_data, ldb,
-                               device_ws_data, device_ws_size);
-    sycl::event e = dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-    device_ws.set_event(e);
-  }
-};
-
-template <typename T> struct value_type_trait {
-  using value_type = T;
-};
-template <typename T> struct value_type_trait<std::complex<T>> {
-  using value_type = T;
-};
-
-template <typename T> auto lamch_s() {
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                           "Project does not support this API.");
-#else
-  if constexpr (std::is_same_v<T, float>) {
-    return slamch("S");
-  } else if constexpr (std::is_same_v<T, double>) {
-    return dlamch("S");
-  }
-  throw std::runtime_error("the type is unsupported");
-#endif
-}
-
-#define DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(FUNC, ...)                          \
-  do {                                                                         \
-    if constexpr (std::is_floating_point_v<T>) {                               \
-      device_ws_size = oneapi::mkl::lapack::sy##FUNC(__VA_ARGS__);             \
-    } else {                                                                   \
-      device_ws_size = oneapi::mkl::lapack::he##FUNC(__VA_ARGS__);             \
-    }                                                                          \
-  } while (0)
-
-#define DISPATCH_FLOAT_FOR_CALCULATION(FUNC, ...)                              \
-  do {                                                                         \
-    if constexpr (std::is_floating_point_v<T>) {                               \
-      oneapi::mkl::lapack::sy##FUNC(__VA_ARGS__);                              \
-    } else {                                                                   \
-      oneapi::mkl::lapack::he##FUNC(__VA_ARGS__);                              \
-    }                                                                          \
-  } while (0)
-
-template <typename T> struct syheevx_scratchpad_size_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::compz jobz,
-                  oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::int64_t lda, void *vl, void *vu,
-                  std::int64_t il, std::int64_t iu,
-                  std::size_t &device_ws_size) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error(
-        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
-        "Project does not support this API.");
-#else
-    using value_t = typename value_type_trait<T>::value_type;
-    auto vl_value = *reinterpret_cast<value_t *>(vl);
-    auto vu_value = *reinterpret_cast<value_t *>(vu);
-    auto abstol = 2 * lamch_s<value_t>();
-    DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(evx_scratchpad_size<T>, q, jobz, range,
-                                       uplo, n, lda, vl_value, vu_value, il, iu,
-                                       abstol, lda);
-#endif
-  }
-};
-
-template <typename T> constexpr library_data_t get_library_data_t_from_type() {
-  if constexpr (std::is_same_v<T, float>) {
-    return library_data_t::real_float;
-  } else if constexpr (std::is_same_v<T, double>) {
-    return library_data_t::real_double;
-  } else if constexpr (std::is_same_v<T, sycl::float2> ||
-                       std::is_same_v<T, std::complex<float>>) {
-    return library_data_t::complex_float;
-  } else if constexpr (std::is_same_v<T, sycl::double2> ||
-                       std::is_same_v<T, std::complex<double>>) {
-    return library_data_t::complex_double;
-  }
-  throw std::runtime_error("the type is unsupported");
-}
-
-template <typename T> struct syheevx_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::compz jobz,
-                  oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
-                  std::int64_t n, library_data_t a_type, void *a,
-                  std::int64_t lda, void *vl, void *vu, std::int64_t il,
-                  std::int64_t iu, std::int64_t *m, library_data_t w_type,
-                  void *w, void *device_ws, std::size_t device_ws_size,
-                  int *info) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error(
-        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
-        "Project does not support this API.");
-#else
-    using value_t = typename value_type_trait<T>::value_type;
-    working_memory<T> z(n * lda, q);
-    working_memory<std::int64_t> m_device(1, q);
-    auto z_data = z.get_memory();
-    auto m_device_data = m_device.get_memory();
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
-    auto vl_value = *reinterpret_cast<value_t *>(vl);
-    auto vu_value = *reinterpret_cast<value_t *>(vu);
-    auto w_data = dpct::detail::get_memory<value_t>(w);
-    auto abstol = 2 * lamch_s<value_t>();
-    DISPATCH_FLOAT_FOR_CALCULATION(evx, q, jobz, range, uplo, n, a_data, lda,
-                                   vl_value, vu_value, il, iu, abstol,
-                                   m_device_data, w_data, z_data, lda,
-                                   device_ws_data, device_ws_size);
-    dpct::async_dpct_memcpy(a, z.get_ptr(), n * lda * sizeof(T),
-                            memcpy_direction::device_to_device, q);
-    dpct::async_dpct_memcpy(m, m_device.get_ptr(), sizeof(std::int64_t),
-                            memcpy_direction::device_to_host, q);
-    sycl::event e = dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-    z.set_event(e);
-    m_device.set_event(e);
-#endif
-  }
-};
-
-template <typename T> struct syhegvx_scratchpad_size_impl {
-  void operator()(sycl::queue &q, std::int64_t itype, oneapi::mkl::compz jobz,
-                  oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
-                  std::int64_t n, std::int64_t lda, std::int64_t ldb, void *vl,
-                  void *vu, std::int64_t il, std::int64_t iu,
-                  std::size_t &device_ws_size) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error(
-        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
-        "Project does not support this API.");
-#else
-    using value_t = typename value_type_trait<T>::value_type;
-    auto vl_value = *reinterpret_cast<value_t *>(vl);
-    auto vu_value = *reinterpret_cast<value_t *>(vu);
-    auto abstol = 2 * lamch_s<value_t>();
-    DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(gvx_scratchpad_size<T>, q, itype, jobz,
-                                       range, uplo, n, lda, ldb, vl_value,
-                                       vu_value, il, iu, abstol, lda);
-#endif
-  }
-};
-
-template <typename T> struct syhegvx_impl {
-  void operator()(sycl::queue &q, std::int64_t itype, oneapi::mkl::compz jobz,
-                  oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
-                  std::int64_t n, void *a, std::int64_t lda, void *b,
-                  std::int64_t ldb, void *vl, void *vu, std::int64_t il,
-                  std::int64_t iu, std::int64_t *m, void *w, void *device_ws,
-                  std::size_t device_ws_size, int *info) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error(
-        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
-        "Project does not support this API.");
-#else
-    using value_t = typename value_type_trait<T>::value_type;
-    working_memory<T> z(n * lda, q);
-    working_memory<std::int64_t> m_device(1, q);
-    auto z_data = z.get_memory();
-    auto m_device_data = m_device.get_memory();
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto b_data = dpct::detail::get_memory<T>(b);
-    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
-    auto vl_value = *reinterpret_cast<value_t *>(vl);
-    auto vu_value = *reinterpret_cast<value_t *>(vu);
-    auto w_data = dpct::detail::get_memory<value_t>(w);
-    auto abstol = 2 * lamch_s<value_t>();
-    DISPATCH_FLOAT_FOR_CALCULATION(gvx, q, itype, jobz, range, uplo, n, a_data,
-                                   lda, b_data, ldb, vl_value, vu_value, il, iu,
-                                   abstol, m_device_data, w_data, z_data, lda,
-                                   device_ws_data, device_ws_size);
-    dpct::async_dpct_memcpy(a, z.get_ptr(), n * lda * sizeof(T),
-                            memcpy_direction::device_to_device, q);
-    dpct::async_dpct_memcpy(m, m_device.get_ptr(), sizeof(std::int64_t),
-                            memcpy_direction::device_to_host, q);
-    sycl::event e = dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-    z.set_event(e);
-    m_device.set_event(e);
-#endif
-  }
-};
-
-template <typename T> struct syhegvd_scratchpad_size_impl {
-  void operator()(sycl::queue &q, std::int64_t itype, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                  std::int64_t ldb, std::size_t &device_ws_size) {
-    DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(gvd_scratchpad_size<T>, q, itype, jobz,
-                                       uplo, n, lda, ldb);
-  }
-};
-
-template <typename T> struct syhegvd_impl {
-  void operator()(sycl::queue &q, std::int64_t itype, oneapi::mkl::job jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, void *a,
-                  std::int64_t lda, void *b, std::int64_t ldb, void *w,
-                  void *device_ws, std::size_t device_ws_size,
-                  int *info) {
-    using value_t = typename value_type_trait<T>::value_type;
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto b_data = dpct::detail::get_memory<T>(b);
-    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
-    auto w_data = dpct::detail::get_memory<value_t>(w);
-    DISPATCH_FLOAT_FOR_CALCULATION(gvd, q, itype, jobz, uplo, n, a_data, lda,
-                                   b_data, ldb, w_data, device_ws_data,
-                                   device_ws_size);
-    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-  }
-};
-
-oneapi::mkl::compz job2compz(const oneapi::mkl::job &job) {
-  oneapi::mkl::compz ret;
-  if (job == oneapi::mkl::job::novec) {
-    ret = oneapi::mkl::compz::novectors;
-  } else if (job == oneapi::mkl::job::vec) {
-    ret = oneapi::mkl::compz::vectors;
-  } else {
-    throw std::runtime_error("the job type is unsupported");
-  }
-  return ret;
-}
-
-template <typename T> struct syheev_scratchpad_size_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::compz jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, std::int64_t lda,
-                  std::size_t &device_ws_size) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error(
-        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
-        "Project does not support this API.");
-#else
-    DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(ev_scratchpad_size<T>, q, jobz, uplo, n,
-                                       lda);
-#endif
-  }
-};
-
-template <typename T> struct syheev_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::compz jobz,
-                  oneapi::mkl::uplo uplo, std::int64_t n, void *a,
-                  std::int64_t lda, void *w, void *device_ws,
-                  std::size_t device_ws_size, int *info) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error(
-        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
-        "Project does not support this API.");
-#else
-    using value_t = typename value_type_trait<T>::value_type;
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
-    auto w_data = dpct::detail::get_memory<value_t>(w);
-    DISPATCH_FLOAT_FOR_CALCULATION(ev, q, jobz, uplo, n, a_data, lda, w_data,
-                                   device_ws_data, device_ws_size);
-    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-#endif
-  }
-};
-
-template <typename T> struct syheevd_scratchpad_size_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                  std::int64_t n, library_data_t a_type, std::int64_t lda,
-                  std::size_t &device_ws_size) {
-    DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE(evd_scratchpad_size<T>, q, jobz, uplo, n,
-                                       lda);
-  }
-};
-
-template <typename T> struct syheevd_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                  std::int64_t n, library_data_t a_type, void *a,
-                  std::int64_t lda, void *w, void *device_ws,
-                  std::size_t device_ws_size, int *info) {
-    using value_t = typename value_type_trait<T>::value_type;
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
-    auto w_data = dpct::detail::get_memory<value_t>(w);
-    DISPATCH_FLOAT_FOR_CALCULATION(evd, q, jobz, uplo, n, a_data, lda, w_data,
-                                   device_ws_data, device_ws_size);
-    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-  }
-};
-
-#undef DISPATCH_FLOAT_FOR_SCRATCHPAD_SIZE
-#undef DISPATCH_FLOAT_FOR_CALCULATION
-
-template <typename T> struct trtri_scratchpad_size_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::diag diag, std::int64_t n, library_data_t a_type,
-                  std::int64_t lda, std::size_t &device_ws_size) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error(
-        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
-        "Project does not support this API.");
-#else
-    device_ws_size =
-        oneapi::mkl::lapack::trtri_scratchpad_size<T>(q, uplo, diag, n, lda);
-#endif
-  }
-};
-
-template <typename T> struct trtri_impl {
-  void operator()(sycl::queue &q, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::diag diag, std::int64_t n, library_data_t a_type,
-                  void *a, std::int64_t lda, void *device_ws,
-                  std::size_t device_ws_size, int *info) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error(
-        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
-        "Project does not support this API.");
-#else
-    auto a_data = dpct::detail::get_memory<T>(a);
-    auto device_ws_data = dpct::detail::get_memory<T>(device_ws);
-    oneapi::mkl::lapack::trtri(q, uplo, diag, n, a_data, lda, device_ws_data,
-                               device_ws_size);
-    dpct::detail::dpct_memset<unsigned char>(q, info, 0, sizeof(int));
-#endif
-  }
-};
-} // namespace detail
-
-/// Computes the size of workspace memory of getrf function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] m The number of rows in the matrix A.
-/// \param [in] n The number of columns in the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [out] device_ws_size The workspace size in bytes.
-/// \param [out] host_ws_size The host workspace size in bytes. Currently the
-/// value is always zero.
-inline int getrf_scratchpad_size(sycl::queue &q, std::int64_t m, std::int64_t n,
-                                 library_data_t a_type, std::int64_t lda,
-                                 std::size_t *device_ws_size,
-                                 std::size_t *host_ws_size = nullptr) {
-  if (host_ws_size)
-    *host_ws_size = 0;
-  std::size_t device_ws_size_tmp;
-  int ret = detail::lapack_shim<detail::getrf_scratchpad_size_impl>(
-      q, a_type, nullptr, "getrf_scratchpad_size", q, m, n, a_type, lda,
-      device_ws_size_tmp);
-  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
-  return ret;
-}
-
-/// Computes the LU factorization of a general m-by-n matrix.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] m The number of rows in the matrix A.
-/// \param [in] n The number of columns in the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in, out] a The input matrix A. Overwritten by L and U. The unit
-/// diagonal elements of L are not stored.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [out] ipiv The pivot indices. If \p ipiv is nullptr, non-pivoting
-/// LU factorization is computed.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The workspace size in bytes.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-inline int getrf(sycl::queue &q, std::int64_t m, std::int64_t n,
-                 library_data_t a_type, void *a, std::int64_t lda,
-                 std::int64_t *ipiv, void *device_ws,
-                 std::size_t device_ws_size, int *info) {
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                           "Project does not support this API.");
-#else
-  std::size_t device_ws_size_in_element_number =
-      detail::byte_to_element_number(device_ws_size, a_type);
-  if (ipiv == nullptr) {
-    return detail::lapack_shim<detail::getrfnp_impl>(
-        q, a_type, info, "getrfnp_batch", q, m, n, a_type, a, lda, ipiv,
-        device_ws, device_ws_size_in_element_number, info);
-  }
-  return detail::lapack_shim<detail::getrf_impl>(
-      q, a_type, info, "getrf", q, m, n, a_type, a, lda, ipiv, device_ws,
-      device_ws_size_in_element_number, info);
-#endif
-}
-
-/// Solves a system of linear equations with a LU-factored square coefficient
-/// matrix, with multiple right-hand sides.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] trans Indicates the form of the linear equation.
-/// \param [in] n The order of the matrix A and the number of rows in matrix B.
-/// \param [in] nrhs The number of right hand sides.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in] a The input matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] ipiv The pivot indices.
-/// \param [in] b_type The data type of the matrix B.
-/// \param [in, out] b The matrix B, whose columns are the right-hand sides
-/// for the systems of equations.
-/// \param [in] ldb The leading dimension of the matrix B.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-inline int getrs(sycl::queue &q, oneapi::mkl::transpose trans, std::int64_t n,
-                 std::int64_t nrhs, library_data_t a_type, void *a,
-                 std::int64_t lda, std::int64_t *ipiv, library_data_t b_type,
-                 void *b, std::int64_t ldb, int *info) {
-  return detail::lapack_shim<detail::getrs_impl>(
-      q, a_type, info, "getrs_scratchpad_size/getrs", q, trans, n, nrhs, a_type,
-      a, lda, ipiv, b_type, b, ldb, info);
-}
-
-/// Computes the size of workspace memory of geqrf function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] m The number of rows in the matrix A.
-/// \param [in] n The number of columns in the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [out] device_ws_size The device workspace size in bytes.
-/// \param [out] host_ws_size The host workspace size in bytes. Currently the
-/// value is always zero.
-inline int geqrf_scratchpad_size(sycl::queue &q, std::int64_t m, std::int64_t n,
-                                 library_data_t a_type, std::int64_t lda,
-                                 std::size_t *device_ws_size,
-                                 std::size_t *host_ws_size = nullptr) {
-  if (host_ws_size)
-    *host_ws_size = 0;
-  std::size_t device_ws_size_tmp;
-  int ret = detail::lapack_shim<detail::geqrf_scratchpad_size_impl>(
-      q, a_type, nullptr, "geqrf_scratchpad_size", q, m, n, a_type, lda,
-      device_ws_size_tmp);
-  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
-  return ret;
-}
-
-/// Computes the QR factorization of a general m-by-n matrix.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] m The number of rows in the matrix A.
-/// \param [in] n The number of columns in the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in, out] a The input matrix A. Overwritten by the factorization data.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] tau_type The data type of the array tau.
-/// \param [in] tau The array contains scalars that define elementary reflectors
-/// for the matrix Q in its decomposition in a product of elementary reflectors.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The workspace size in bytes.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-inline int geqrf(sycl::queue &q, std::int64_t m, std::int64_t n,
-                 library_data_t a_type, void *a, std::int64_t lda,
-                 library_data_t tau_type, void *tau, void *device_ws,
-                 std::size_t device_ws_size, int *info) {
-  std::size_t device_ws_size_in_element_number =
-      detail::byte_to_element_number(device_ws_size, a_type);
-  return detail::lapack_shim<detail::geqrf_impl>(
-      q, a_type, info, "geqrf", q, m, n, a_type, a, lda, tau_type, tau,
-      device_ws, device_ws_size_in_element_number, info);
-}
-
-/// Computes the size of workspace memory of gesvd function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobu Must be 'A' (representing jobsvd::vectors), 'S'
-/// (representing jobsvd::somevec), 'O' (representing jobsvd::vectorsina) or 'N'
-/// (representing jobsvd::novec).
-/// \param [in] jobvt Must be 'A' (representing jobsvd::vectors), 'S'
-/// (representing jobsvd::somevec), 'O' (representing jobsvd::vectorsina) or 'N'
-/// (representing jobsvd::novec).
-/// \param [in] m The number of rows in the matrix A.
-/// \param [in] n The number of columns in the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] u_type The data type of the matrix U.
-/// \param [in] ldu The leading dimension of the matrix U.
-/// \param [in] vt_type The data type of the matrix VT.
-/// \param [in] ldvt The leading dimension of the matrix VT.
-/// \param [out] device_ws_size The device workspace size in bytes.
-/// \param [out] host_ws_size The host workspace size in bytes. Currently the
-/// value is always zero.
-inline int gesvd_scratchpad_size(sycl::queue &q, signed char jobu,
-                                 signed char jobvt, std::int64_t m,
-                                 std::int64_t n, library_data_t a_type,
-                                 std::int64_t lda, library_data_t u_type,
-                                 std::int64_t ldu, library_data_t vt_type,
-                                 std::int64_t ldvt, std::size_t *device_ws_size,
-                                 std::size_t *host_ws_size = nullptr) {
-  oneapi::mkl::jobsvd jobu_enum = detail::char2jobsvd(jobu);
-  oneapi::mkl::jobsvd jobvt_enum = detail::char2jobsvd(jobvt);
-  if (host_ws_size)
-    *host_ws_size = 0;
-  std::size_t device_ws_size_tmp;
-  int ret = detail::lapack_shim<detail::gesvd_scratchpad_size_impl>(
-      q, a_type, nullptr, "gesvd_scratchpad_size", q, jobu_enum, jobvt_enum, m,
-      n, a_type, lda, u_type, ldu, vt_type, ldvt, device_ws_size_tmp);
-  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
-  return ret;
-}
-
-/// Computes the size of workspace memory of gesvd function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::vec or job::novec
-/// \param [in] all_vec Only have effects when \param jobz is job::vec.If the
-/// value is zero, all m columns of U are returned in the matrix U, otherwise
-/// the first min( \param m, \param n ) columns of U (the left singular vectors)
-/// are returned in the matrix U.
-/// \param [in] m The number of rows in the matrix A.
-/// \param [in] n The number of columns in the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] u_type The data type of the matrix U.
-/// \param [in] ldu The leading dimension of the matrix U.
-/// \param [in] vt_type The data type of the matrix VT.
-/// \param [in] ldvt The leading dimension of the matrix VT.
-/// \param [out] device_ws_size The device workspace size as a number of
-/// elements of type \param a_type.
-/// \param [out] host_ws_size The host workspace size as a number of elements
-/// of type \param a_type. Currently the value is always zero.
-inline int gesvd_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
-                                 std::int64_t all_vec, std::int64_t m,
-                                 std::int64_t n, library_data_t a_type,
-                                 std::int64_t lda, library_data_t u_type,
-                                 std::int64_t ldu, library_data_t vt_type,
-                                 std::int64_t ldvt, int *device_ws_size,
-                                 std::size_t *host_ws_size = nullptr) {
-  if (host_ws_size)
-    *host_ws_size = 0;
-  oneapi::mkl::jobsvd jobu;
-  oneapi::mkl::jobsvd jobvt;
-  if (jobz == oneapi::mkl::job::vec) {
-    if (all_vec) {
-      jobu = jobvt = oneapi::mkl::jobsvd::somevec;
-    } else {
-      jobu = jobvt = oneapi::mkl::jobsvd::vectors;
-    }
-  } else if (jobz == oneapi::mkl::job::novec) {
-    jobu = jobvt = oneapi::mkl::jobsvd::novec;
-  } else {
-    throw std::runtime_error("the job type is unsupported");
-  }
-  std::size_t device_ws_size_64;
-  int ret = detail::lapack_shim<detail::gesvd_scratchpad_size_impl>(
-      q, a_type, nullptr, "gesvd_scratchpad_size", q, jobu, jobvt, m, n, a_type,
-      lda, u_type, ldu, vt_type, ldvt, device_ws_size_64);
-  *device_ws_size = device_ws_size_64;
-  return ret;
-}
-
-/// Computes the size of workspace memory of gesvd function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobu Must be 'A' (representing jobsvd::vectors), 'S'
-/// (representing jobsvd::somevec), 'O' (representing jobsvd::vectorsina) or 'N'
-/// (representing jobsvd::novec).
-/// \param [in] jobvt Must be 'A' (representing jobsvd::vectors), 'S'
-/// (representing jobsvd::somevec), 'O' (representing jobsvd::vectorsina) or 'N'
-/// (representing jobsvd::novec).
-/// \param [in] m The number of rows in the matrix A.
-/// \param [in] n The number of columns in the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in, out] a The input matrix A and it will be overwritten according
-/// to \p jobu and \p jobvt.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] s_type The data type of the matrix S.
-/// \param [out] s The output matrix S.
-/// \param [in] u_type The data type of the matrix U.
-/// \param [out] u The output matrix U.
-/// \param [in] ldu The leading dimension of the matrix U.
-/// \param [in] vt_type The data type of the matrix VT.
-/// \param [out] vt The output matrix VT.
-/// \param [in] ldvt The leading dimension of the matrix VT.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The workspace size in bytes.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-inline int gesvd(sycl::queue &q, signed char jobu, signed char jobvt,
-                 std::int64_t m, std::int64_t n, library_data_t a_type, void *a,
-                 std::int64_t lda, library_data_t s_type, void *s,
-                 library_data_t u_type, void *u, std::int64_t ldu,
-                 library_data_t vt_type, void *vt, std::int64_t ldvt,
-                 void *device_ws, std::size_t device_ws_size, int *info) {
-  oneapi::mkl::jobsvd jobu_enum = detail::char2jobsvd(jobu);
-  oneapi::mkl::jobsvd jobvt_enum = detail::char2jobsvd(jobvt);
-  std::size_t device_ws_size_in_element_number =
-      detail::byte_to_element_number(device_ws_size, a_type);
-  return detail::lapack_shim<detail::gesvd_impl>(
-      q, a_type, info, "gesvd", q, jobu_enum, jobvt_enum, m, n, a_type, a, lda,
-      s_type, s, u_type, u, ldu, vt_type, vt, ldvt, device_ws,
-      device_ws_size_in_element_number, info);
-}
-
-/// Computes the size of workspace memory of gesvd function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::vec or job::novec.
-/// \param [in] all_vec Only have effects when \param jobz is job::vec.If the
-/// value is zero, all m columns of U are returned in the matrix U, otherwise
-/// the first min( \param m, \param n ) columns of U (the left singular vectors)
-/// are returned in the matrix U.
-/// \param [in] m The number of rows in the matrix A.
-/// \param [in] n The number of columns in the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in, out] a The input matrix A and it will be overwritten according
-/// to \p jobu and \p jobvt.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] s_type The data type of the matrix S.
-/// \param [out] s The output matrix S.
-/// \param [in] u_type The data type of the matrix U.
-/// \param [out] u The output matrix U.
-/// \param [in] ldu The leading dimension of the matrix U.
-/// \param [in] vt_type The data type of the matrix VT.
-/// \param [out] vt The output matrix VT.
-/// \param [in] ldvt The leading dimension of the matrix VT.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The device workspace size as a number of
-/// elements of type \param a_type.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-inline int gesvd(sycl::queue &q, oneapi::mkl::job jobz, std::int64_t all_vec,
-                 std::int64_t m, std::int64_t n, library_data_t a_type, void *a,
-                 std::int64_t lda, library_data_t s_type, void *s,
-                 library_data_t u_type, void *u, std::int64_t ldu,
-                 library_data_t vt_type, void *vt, std::int64_t ldvt,
-                 void *device_ws, std::size_t device_ws_size, int *info) {
-  oneapi::mkl::jobsvd jobu;
-  oneapi::mkl::jobsvd jobvt;
-  if (jobz == oneapi::mkl::job::vec) {
-    if (all_vec) {
-      jobu = jobvt = oneapi::mkl::jobsvd::somevec;
-    } else {
-      jobu = jobvt = oneapi::mkl::jobsvd::vectors;
-    }
-  } else if (jobz == oneapi::mkl::job::novec) {
-    jobu = jobvt = oneapi::mkl::jobsvd::novec;
-  } else {
-    throw std::runtime_error("the job type is unsupported");
-  }
-
-  detail::lapack_shim<detail::gesvd_conj_impl>(
-      q, a_type, info, "gesvd", q, jobu, jobvt, m, n, a_type, a, lda, s_type, s,
-      u_type, u, ldu, vt_type, vt, ldvt, device_ws, device_ws_size, info);
-  return 0;
-}
-
-/// Computes the size of workspace memory of potrf function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The number of columns in the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [out] device_ws_size The device workspace size in bytes.
-/// \param [out] host_ws_size The host workspace size in bytes. Currently the
-/// value is always zero.
-inline int potrf_scratchpad_size(sycl::queue &q, oneapi::mkl::uplo uplo,
-                                 std::int64_t n, library_data_t a_type,
-                                 std::int64_t lda, std::size_t *device_ws_size,
-                                 std::size_t *host_ws_size = nullptr) {
-  if (host_ws_size)
-    *host_ws_size = 0;
-  std::size_t device_ws_size_tmp;
-  int ret = detail::lapack_shim<detail::potrf_scratchpad_size_impl>(
-      q, a_type, nullptr, "potrf_scratchpad_size", q, uplo, n, a_type, lda,
-      device_ws_size_tmp);
-  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
-  return ret;
-}
-
-/// Computes the Cholesky factorization of a symmetric (Hermitian)
-/// positive-definite matrix.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The number of columns in the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in, out] a The input matrix A. Overwritten by the Cholesky factor U
-/// or L, as specified by \p uplo.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The workspace size in bytes.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-inline int potrf(sycl::queue &q, oneapi::mkl::uplo uplo, std::int64_t n,
-                 library_data_t a_type, void *a, std::int64_t lda,
-                 void *device_ws, std::size_t device_ws_size, int *info) {
-  std::size_t device_ws_size_in_element_number =
-      detail::byte_to_element_number(device_ws_size, a_type);
-  return detail::lapack_shim<detail::potrf_impl>(
-      q, a_type, info, "potrf", q, uplo, n, a_type, a, lda, device_ws,
-      device_ws_size_in_element_number, info);
-}
-
-/// Solves a system of linear equations with a Cholesky-factored symmetric
-/// (Hermitian) positive-definite coefficient matrix.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A and the number of rows in matrix B.
-/// \param [in] nrhs The number of right hand sides.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in, out] a The input matrix A. Overwritten by the Cholesky factor U
-/// or L, as specified by \p uplo.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] b_type The data type of the matrix B.
-/// \param [in, out] b The matrix B, whose columns are the right-hand sides
-/// for the systems of equations.
-/// \param [in] ldb The leading dimension of the matrix B.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-inline int potrs(sycl::queue &q, oneapi::mkl::uplo uplo, std::int64_t n,
-                 std::int64_t nrhs, library_data_t a_type, void *a,
-                 std::int64_t lda, library_data_t b_type, void *b,
-                 std::int64_t ldb, int *info) {
-  return detail::lapack_shim<detail::potrs_impl>(
-      q, a_type, info, "potrs_scratchpad_size/potrs", q, uplo, n, nrhs, a_type,
-      a, lda, b_type, b, ldb, info);
-}
-
-/// Computes the size of workspace memory of syevx/heevx function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] vl If range == rangev::values, the lower bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] vu If range == rangev::values, the upper bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] il If range == rangev::indices, the indices of the smallest
-/// eigenvalue to be returned.
-/// \param [in] iu If range == rangev::indices, the indices of the largest
-/// eigenvalue to be returned.
-/// \param [in] w_type The data type of the eigenvalues.
-/// \param [out] device_ws_size The device workspace size in bytes.
-/// \param [out] host_ws_size The host workspace size in bytes. Currently the
-/// value is always zero.
-inline int syheevx_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
-                                   oneapi::mkl::rangev range,
-                                   oneapi::mkl::uplo uplo, std::int64_t n,
-                                   library_data_t a_type, std::int64_t lda,
-                                   void *vl, void *vu, std::int64_t il,
-                                   std::int64_t iu, library_data_t w_type,
-                                   std::size_t *device_ws_size,
-                                   std::size_t *host_ws_size = nullptr) {
-  if (host_ws_size)
-    *host_ws_size = 0;
-  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
-  std::size_t device_ws_size_tmp;
-  int ret = detail::lapack_shim<detail::syheevx_scratchpad_size_impl>(
-      q, a_type, nullptr, "syevx_scratchpad_size/heevx_scratchpad_size", q,
-      compz_jobz, range, uplo, n, lda, vl, vu, il, iu,
-      device_ws_size_tmp);
-  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
-  return ret;
-}
-
-/// Computes selected eigenvalues and, optionally, eigenvectors of a
-/// symmetric/Hermitian matrix.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in, out] a The input matrix A. On exit, the lower or upper triangle is
-/// overwritten.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] vl If range == rangev::values, the lower bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] vu If range == rangev::values, the upper bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] il If range == rangev::indices, the indices of the smallest
-/// eigenvalue to be returned.
-/// \param [in] iu If range == rangev::indices, the indices of the largest
-/// eigenvalue to be returned.
-/// \param [out] m The total number of eigenvalues found.
-/// \param [in] w_type The data type of the eigenvalues.
-/// \param [out] w The eigenvalues of the matrix A in ascending order.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The workspace size in bytes.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-inline int syheevx(sycl::queue &q, oneapi::mkl::job jobz,
-                   oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
-                   std::int64_t n, library_data_t a_type, void *a,
-                   std::int64_t lda, void *vl, void *vu, std::int64_t il,
-                   std::int64_t iu, std::int64_t *m, library_data_t w_type,
-                   void *w, void *device_ws, std::size_t device_ws_size,
-                   int *info) {
-  std::size_t device_ws_size_in_element_number =
-      detail::byte_to_element_number(device_ws_size, a_type);
-  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
-  int ret = detail::lapack_shim<detail::syheevx_impl>(
-      q, a_type, info, "syevx/heevx", q, compz_jobz, range, uplo, n, a_type, a,
-      lda, vl, vu, il, iu, m, w_type, w, device_ws,
-      device_ws_size_in_element_number, info);
-  q.wait();
-  return ret;
-}
-
-/// Computes the size of workspace memory of syevx/heevx function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] vl If range == rangev::values, the lower bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] vu If range == rangev::values, the upper bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] il If range == rangev::indices, the indices of the smallest
-/// eigenvalue to be returned.
-/// \param [in] iu If range == rangev::indices, the indices of the largest
-/// eigenvalue to be returned.
-/// \param [out] device_ws_size The device workspace size as a number of
-/// elements of type \tparam T.
-template <typename T, typename ValueT>
-inline int syheevx_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
-                                   oneapi::mkl::rangev range,
-                                   oneapi::mkl::uplo uplo, int n, int lda,
-                                   ValueT vl, ValueT vu, int il, int iu,
-                                   int *device_ws_size) {
-  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
-  std::size_t device_ws_size_tmp;
-  int ret = detail::lapack_shim<detail::syheevx_scratchpad_size_impl>(
-      q, detail::get_library_data_t_from_type<T>(), nullptr,
-      "syevx_scratchpad_size/heevx_scratchpad_size", q, compz_jobz, range, uplo,
-      n, lda, &vl, &vu, il, iu, device_ws_size_tmp);
-  *device_ws_size = (int)device_ws_size_tmp;
-  return ret;
-}
-
-/// Computes selected eigenvalues and, optionally, eigenvectors of a
-/// symmetric/Hermitian matrix.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in, out] a The input matrix A. On exit, the lower or upper triangle is
-/// overwritten.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] vl If range == rangev::values, the lower bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] vu If range == rangev::values, the upper bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] il If range == rangev::indices, the indices of the smallest
-/// eigenvalue to be returned.
-/// \param [in] iu If range == rangev::indices, the indices of the largest
-/// eigenvalue to be returned.
-/// \param [out] m The total number of eigenvalues found.
-/// \param [out] w The eigenvalues of the matrix A in ascending order.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The device workspace size as a number of
-/// elements of type \tparam T.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-template <typename T, typename ValueT>
-inline int syheevx(sycl::queue &q, oneapi::mkl::job jobz,
-                   oneapi::mkl::rangev range, oneapi::mkl::uplo uplo, int n,
-                   T *a, int lda, ValueT vl, ValueT vu, int il, int iu, int *m,
-                   ValueT *w, T *device_ws, int device_ws_size, int *info) {
-  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
-  std::int64_t m64;
-  int ret = detail::lapack_shim<detail::syheevx_impl>(
-      q, detail::get_library_data_t_from_type<T>(), info, "syevx/heevx", q,
-      compz_jobz, range, uplo, n, detail::get_library_data_t_from_type<T>(), a,
-      lda, &vl, &vu, il, iu, &m64,
-      detail::get_library_data_t_from_type<ValueT>(), w, device_ws,
-      device_ws_size, info);
-  q.wait();
-  *m = (int)m64;
-  return ret;
-}
-
-/// Computes the size of workspace memory of sygvx/hegvx function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] itype Must be 1, 2 or 3.
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] ldb The leading dimension of the matrix B.
-/// \param [in] vl If range == rangev::values, the lower bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] vu If range == rangev::values, the upper bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] il If range == rangev::indices, the indices of the smallest
-/// eigenvalue to be returned.
-/// \param [in] iu If range == rangev::indices, the indices of the largest
-/// eigenvalue to be returned.
-/// \param [in] device_ws_size The device workspace size as a number of
-/// elements of type \tparam T.
-template <typename T, typename ValueT>
-inline int
-syhegvx_scratchpad_size(sycl::queue &q, int itype, oneapi::mkl::job jobz,
-                        oneapi::mkl::rangev range, oneapi::mkl::uplo uplo,
-                        int n, int lda, int ldb, ValueT vl, ValueT vu, int il,
-                        int iu, int *device_ws_size) {
-  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
-  std::size_t device_ws_size_tmp;
-  int ret = detail::lapack_shim<detail::syhegvx_scratchpad_size_impl>(
-      q, detail::get_library_data_t_from_type<T>(), nullptr,
-      "sygvx_scratchpad_size/hegvx_scratchpad_size", q, itype, compz_jobz,
-      range, uplo, n, lda, ldb, &vl, &vu, il, iu, device_ws_size_tmp);
-  *device_ws_size = (int)device_ws_size_tmp;
-  return ret;
-}
-
-/// Computes selected eigenvalues and, optionally, eigenvectors of a real
-/// generalized symmetric/Hermitian definite eigenproblem.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] itype Must be 1, 2 or 3.
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] range Must be rangev::all, rangev::values or uplo::indices.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in, out] a The input matrix A. On exit, the lower or upper triangle is
-/// overwritten.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in, out] b The input matrix B.
-/// \param [in] ldb The leading dimension of the matrix B.
-/// \param [in] vl If range == rangev::values, the lower bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] vu If range == rangev::values, the upper bound of the interval
-/// to be searched for eigenvalues
-/// \param [in] il If range == rangev::indices, the indices of the smallest
-/// eigenvalue to be returned.
-/// \param [in] iu If range == rangev::indices, the indices of the largest
-/// eigenvalue to be returned.
-/// \param [out] m The total number of eigenvalues found.
-/// \param [out] w The eigenvalues of the matrix A in ascending order.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The device workspace size as a number of
-/// elements of type \tparam T.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-template <typename T, typename ValueT>
-inline int syhegvx(sycl::queue &q, int itype, oneapi::mkl::job jobz,
-                   oneapi::mkl::rangev range, oneapi::mkl::uplo uplo, int n,
-                   T *a, int lda, T *b, int ldb, ValueT vl, ValueT vu, int il,
-                   int iu, int *m, ValueT *w, T *device_ws, int device_ws_size,
-                   int *info) {
-  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
-  std::int64_t m64;
-  int ret = detail::lapack_shim<detail::syhegvx_impl>(
-      q, detail::get_library_data_t_from_type<T>(), info, "sygvx/hegvx", q,
-      itype, compz_jobz, range, uplo, n, a, lda, b, ldb, &vl, &vu, il, iu, &m64,
-      w, device_ws, device_ws_size, info);
-  q.wait();
-  *m = (int)m64;
-  return ret;
-}
-
-/// Computes the size of workspace memory of sygvd/hegvd function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] itype Must be 1, 2 or 3.
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] ldb The leading dimension of the matrix B.
-/// \param [in] device_ws_size The device workspace size as a number of
-/// elements of type \tparam T.
-template <typename T>
-inline int syhegvd_scratchpad_size(sycl::queue &q, int itype,
-                                   oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, int n, int lda,
-                                   int ldb, int *device_ws_size) {
-  std::size_t device_ws_size_tmp;
-  int ret = detail::lapack_shim<detail::syhegvd_scratchpad_size_impl>(
-      q, detail::get_library_data_t_from_type<T>(), nullptr,
-      "sygvd_scratchpad_size/hegvd_scratchpad_size", q, itype, jobz, uplo, n,
-      lda, ldb, device_ws_size_tmp);
-  *device_ws_size = (int)device_ws_size_tmp;
-  return ret;
-}
-
-/// Computes all eigenvalues and, optionally, eigenvectors of a real generalized
-/// symmetric/Hermitian definite eigenproblem using a divide and conquer method.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] itype Must be 1, 2 or 3.
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in, out] a The input matrix A. On exit, it is overwritten by eigenvectors.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in, out] b The input matrix B.
-/// \param [in] ldb The leading dimension of the matrix B.
-/// \param [out] w The eigenvalues of the matrix A in ascending order.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The device workspace size as a number of
-/// elements of type \tparam T.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-template <typename T, typename ValueT>
-inline int syhegvd(sycl::queue &q, int itype, oneapi::mkl::job jobz,
-                   oneapi::mkl::uplo uplo, int n, T *a, int lda, T *b, int ldb,
-                   ValueT *w, T *device_ws, int device_ws_size, int *info) {
-  return detail::lapack_shim<detail::syhegvd_impl>(
-      q, detail::get_library_data_t_from_type<T>(), info, "sygvd/hegvd", q,
-      itype, jobz, uplo, n, a, lda, b, ldb, w, device_ws, device_ws_size, info);
-}
-
-/// Computes the size of workspace memory of syev/heev function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [out] device_ws_size The device workspace size as a number of
-/// elements of type \tparam T.
-template <typename T>
-inline int syheev_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
-                                  oneapi::mkl::uplo uplo, int n, int lda,
-                                  int *device_ws_size) {
-  std::size_t device_ws_size_tmp;
-  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
-  int ret = detail::lapack_shim<detail::syheev_scratchpad_size_impl>(
-      q, detail::get_library_data_t_from_type<T>(), nullptr,
-      "syev_scratchpad_size/heev_scratchpad_size", q, compz_jobz, uplo, n, lda,
-      device_ws_size_tmp);
-  *device_ws_size = (int)device_ws_size_tmp;
-  return ret;
-}
-
-/// Computes all eigenvalues and, optionally, eigenvectors of a real symmetric
-/// or Hermitian matrix.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in, out] a The input matrix A. On exit, it is overwritten by
-/// eigenvectors.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [out] w The eigenvalues of the matrix A in ascending order.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The device workspace size as a number of
-/// elements of type \tparam T.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-template <typename T, typename ValueT>
-inline int syheev(sycl::queue &q, oneapi::mkl::job jobz, oneapi::mkl::uplo uplo,
-                  int n, T *a, int lda, ValueT *w, T *device_ws,
-                  int device_ws_size, int *info) {
-  oneapi::mkl::compz compz_jobz = detail::job2compz(jobz);
-  return detail::lapack_shim<detail::syheev_impl>(
-      q, detail::get_library_data_t_from_type<T>(), info, "syev/heev", q,
-      compz_jobz, uplo, n, a, lda, w, device_ws, device_ws_size, info);
-}
-
-/// Computes the size of workspace memory of syevd/heevd function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] w_type The data type of the eigenvalues.
-/// \param [out] device_ws_size The device workspace in bytes.
-/// \param [out] host_ws_size The host workspace size in bytes. Currently the
-/// value is always zero.
-inline int syheevd_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n,
-                                   library_data_t a_type, std::int64_t lda,
-                                   library_data_t w_type,
-                                   std::size_t *device_ws_size,
-                                   std::size_t *host_ws_size = nullptr) {
-  if (host_ws_size)
-    *host_ws_size = 0;
-  std::size_t device_ws_size_tmp;
-  int ret = detail::lapack_shim<detail::syheevd_scratchpad_size_impl>(
-      q, a_type, nullptr, "syevd_scratchpad_size/heevd_scratchpad_size", q,
-      jobz, uplo, n, a_type, lda, device_ws_size_tmp);
-  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
-  return ret;
-}
-
-/// Computes all eigenvalues and, optionally, all eigenvectors of a real
-/// symmetric or Hermitian matrix using divide and conquer algorithm.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in, out] a The input matrix A. On exit, it is overwritten by
-/// eigenvectors.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] w_type The data type of the eigenvalues.
-/// \param [out] w The eigenvalues of the matrix A in ascending order.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The workspace size in bytes.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-inline int syheevd(sycl::queue &q, oneapi::mkl::job jobz,
-                   oneapi::mkl::uplo uplo, std::int64_t n,
-                   library_data_t a_type, void *a, std::int64_t lda,
-                   library_data_t w_type, void *w, void *device_ws,
-                   std::size_t device_ws_size, int *info) {
-  std::size_t device_ws_size_in_element_number =
-      detail::byte_to_element_number(device_ws_size, a_type);
-  return detail::lapack_shim<detail::syheevd_impl>(
-      q, a_type, info, "syevd/heevd", q, jobz, uplo, n, a_type, a, lda, w,
-      device_ws, device_ws_size_in_element_number, info);
-}
-
-/// Computes the size of workspace memory of syevd/heevd function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] w_type The data type of the eigenvalues.
-/// \param [out] device_ws_size The device workspace size as a number of
-/// elements of type \tparam T.
-template <typename T>
-inline int syheevd_scratchpad_size(sycl::queue &q, oneapi::mkl::job jobz,
-                                   oneapi::mkl::uplo uplo, std::int64_t n,
-                                   std::int64_t lda, int *device_ws_size) {
-  std::size_t device_ws_size_tmp;
-  int ret = detail::lapack_shim<detail::syheevd_scratchpad_size_impl>(
-      q, detail::get_library_data_t_from_type<T>(), nullptr,
-      "syevd_scratchpad_size/heevd_scratchpad_size", q, jobz, uplo, n,
-      detail::get_library_data_t_from_type<T>(), lda, device_ws_size_tmp);
-  *device_ws_size = (int)device_ws_size_tmp;
-  return ret;
-}
-
-/// Computes all eigenvalues and, optionally, all eigenvectors of a real
-/// symmetric or Hermitian matrix using divide and conquer algorithm.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] jobz Must be job::novec or job::vec.
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] n The order of the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in, out] a The input matrix A. On exit, it is overwritten by
-/// eigenvectors.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] w_type The data type of the eigenvalues.
-/// \param [out] w The eigenvalues of the matrix A in ascending order.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The workspace size as a number of
-/// elements of type \tparam T.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-template <typename T, typename ValueT>
-inline int syheevd(sycl::queue &q, oneapi::mkl::job jobz,
-                   oneapi::mkl::uplo uplo, std::int64_t n, T *a,
-                   std::int64_t lda, ValueT *w, T *device_ws,
-                   int device_ws_size, int *info) {
-  return detail::lapack_shim<detail::syheevd_impl>(
-      q, detail::get_library_data_t_from_type<T>(), info, "syevd/heevd", q,
-      jobz, uplo, n, detail::get_library_data_t_from_type<T>(), a, lda, w,
-      device_ws, device_ws_size, info);
-}
-
-/// Computes the size of workspace memory of trtri function.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] diag Must be diag::nonunit or diag::unit.
-/// \param [in] n The order of the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [out] device_ws_size The device workspace in bytes.
-/// \param [out] host_ws_size The host workspace size in bytes. Currently the
-/// value is always zero.
-inline int trtri_scratchpad_size(sycl::queue &q, oneapi::mkl::uplo uplo,
-                                 oneapi::mkl::diag diag, std::int64_t n,
-                                 library_data_t a_type, std::int64_t lda,
-                                 std::size_t *device_ws_size,
-                                 std::size_t *host_ws_size = nullptr) {
-  if (host_ws_size)
-    *host_ws_size = 0;
-  std::size_t device_ws_size_tmp;
-  int ret = detail::lapack_shim<detail::trtri_scratchpad_size_impl>(
-      q, a_type, nullptr, "trtri_scratchpad_size", q, uplo, diag, n, a_type,
-      lda, device_ws_size_tmp);
-  *device_ws_size = detail::element_number_to_byte(device_ws_size_tmp, a_type);
-  return ret;
-}
-
-/// Computes the inverse of a triangular matrix.
-/// \return Returns 0 if no synchronous exception, otherwise returns 1.
-/// \param [in] q Device queue where computation will be performed. It must
-/// have the in_order property when using the USM mode (DPCT_USM_LEVEL_NONE is
-/// not defined).
-/// \param [in] uplo Must be uplo::upper or uplo::lower.
-/// \param [in] diag Must be diag::nonunit or diag::unit.
-/// \param [in] n The order of the matrix A.
-/// \param [in] a_type The data type of the matrix A.
-/// \param [in, out] a The input matrix A. On exit, it is overwritten by
-/// the inverse matrix of A.
-/// \param [in] lda The leading dimension of the matrix A.
-/// \param [in] device_ws The workspace.
-/// \param [in] device_ws_size The workspace size in bytes.
-/// \param [out] info If lapack synchronous exception is caught, the value
-/// returned from info() method of the exception is set to \p info.
-inline int trtri(sycl::queue &q, oneapi::mkl::uplo uplo, oneapi::mkl::diag diag,
-                 std::int64_t n, library_data_t a_type, void *a,
-                 std::int64_t lda, void *device_ws, std::size_t device_ws_size,
-                 int *info) {
-#ifdef DPCT_USM_LEVEL_NONE
-  throw std::runtime_error("this API is unsupported when USM level is none");
-#else
-  std::size_t device_ws_size_in_element_number =
-      detail::byte_to_element_number(device_ws_size, a_type);
-  return detail::lapack_shim<detail::trtri_impl>(
-      q, a_type, info, "trtri", q, uplo, diag, n, a_type, a, lda, device_ws,
-      device_ws_size_in_element_number, info);
-#endif
-}
-} // namespace lapack
-} // namespace dpct
-
-#endif // __DPCT_LAPACK_UTILS_HPP__
diff --git a/dpct/lib_common_utils.hpp b/dpct/lib_common_utils.hpp
deleted file mode 100644
index 62366da986cdc..0000000000000
--- a/dpct/lib_common_utils.hpp
+++ /dev/null
@@ -1,174 +0,0 @@
-//==---- lib_common_utils.hpp ---------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_LIB_COMMON_UTILS_HPP__
-#define __DPCT_LIB_COMMON_UTILS_HPP__
-
-#include <sycl/sycl.hpp>
-#include <oneapi/mkl.hpp>
-#include "memory.hpp"
-#include "util.hpp"
-
-namespace dpct {
-namespace detail {
-template <typename T> inline auto get_memory(const void *x) {
-  T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
-#ifdef DPCT_USM_LEVEL_NONE
-  return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
-#else
-  return new_x;
-#endif
-}
-
-template <typename T>
-inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q) {
-  using Ty = typename DataType<T>::T2;
-  Ty s_h;
-  if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
-    detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
-        .wait();
-  else
-    s_h = *reinterpret_cast<const Ty *>(s);
-  return s_h;
-}
-} // namespace detail
-
-enum class version_field : int { major, minor, update, patch };
-
-/// Returns the requested field of Intel(R) oneAPI Math Kernel Library version.
-/// \param field The version information field (major, minor, update or patch).
-/// \param result The result value.
-inline void mkl_get_version(version_field field, int *result) {
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                           "Project does not support this API.");
-#else
-  MKLVersion version;
-  mkl_get_version(&version);
-  if (version_field::major == field) {
-    *result = version.MajorVersion;
-  } else if (version_field::minor == field) {
-    *result = version.MinorVersion;
-  } else if (version_field::update == field) {
-    *result = version.UpdateVersion;
-  } else if (version_field::patch == field) {
-    *result = 0;
-  } else {
-    throw std::runtime_error("unknown field");
-  }
-#endif
-}
-
-enum class library_data_t : unsigned char {
-  real_float = 0,
-  complex_float,
-  real_double,
-  complex_double,
-  real_half,
-  complex_half,
-  real_bfloat16,
-  complex_bfloat16,
-  real_int4,
-  complex_int4,
-  real_uint4,
-  complex_uint4,
-  real_int8,
-  complex_int8,
-  real_uint8,
-  complex_uint8,
-  real_int16,
-  complex_int16,
-  real_uint16,
-  complex_uint16,
-  real_int32,
-  complex_int32,
-  real_uint32,
-  complex_uint32,
-  real_int64,
-  complex_int64,
-  real_uint64,
-  complex_uint64,
-  real_int8_4,
-  real_int8_32,
-  real_uint8_4,
-  library_data_t_size
-};
-
-namespace detail {
-template <typename ArgT>
-inline constexpr std::uint64_t get_type_combination_id(ArgT Val) {
-  static_assert((unsigned char)library_data_t::library_data_t_size <=
-                    std::numeric_limits<unsigned char>::max() &&
-                "library_data_t size exceeds limit.");
-  static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
-  return (std::uint64_t)Val;
-}
-
-template <typename FirstT, typename... RestT>
-inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
-                                                       RestT... RestVal) {
-  static_assert((std::uint8_t)library_data_t::library_data_t_size <=
-                    std::numeric_limits<unsigned char>::max() &&
-                "library_data_t size exceeds limit.");
-  static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
-  static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
-  return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
-}
-
-inline constexpr std::size_t library_data_size[] = {
-    8 * sizeof(float),                    // real_float
-    8 * sizeof(std::complex<float>),      // complex_float
-    8 * sizeof(double),                   // real_double
-    8 * sizeof(std::complex<double>),     // complex_double
-    8 * sizeof(sycl::half),               // real_half
-    8 * sizeof(std::complex<sycl::half>), // complex_half
-    16,                                   // real_bfloat16
-    16 * 2,                               // complex_bfloat16
-    4,                                    // real_int4
-    4 * 2,                                // complex_int4
-    4,                                    // real_uint4
-    4 * 2,                                // complex_uint4
-    8,                                    // real_int8
-    8 * 2,                                // complex_int8
-    8,                                    // real_uint8
-    8 * 2,                                // complex_uint8
-    16,                                   // real_int16
-    16 * 2,                               // complex_int16
-    16,                                   // real_uint16
-    16 * 2,                               // complex_uint16
-    32,                                   // real_int32
-    32 * 2,                               // complex_int32
-    32,                                   // real_uint32
-    32 * 2,                               // complex_uint32
-    64,                                   // real_int64
-    64 * 2,                               // complex_int64
-    64,                                   // real_uint64
-    64 * 2,                               // complex_uint64
-    8,                                    // real_int8_4
-    8,                                    // real_int8_32
-    8                                     // real_uint8_4
-};
-} // namespace detail
-
-#ifdef DPCT_USM_LEVEL_NONE
-/// Cast a "rvalue reference to a temporary object" to an "lvalue reference to
-/// that temporary object".
-/// CAUTION:
-/// The returned lvalue reference is available only before the last step in
-/// evaluating the full-expression that contains this function call.
-/// \param [in] temporary_object The rvalue reference to a temporary object.
-/// \returns The lvalue reference to that temporary object.
-template <typename T>
-inline typename std::enable_if_t<std::is_rvalue_reference_v<T &&>, T &>
-rvalue_ref_to_lvalue_ref(T &&temporary_object) {
-  return temporary_object;
-}
-#endif
-} // namespace dpct
-
-#endif // __DPCT_LIB_COMMON_UTILS_HPP__
diff --git a/dpct/math.hpp b/dpct/math.hpp
deleted file mode 100644
index 8d400fa682a00..0000000000000
--- a/dpct/math.hpp
+++ /dev/null
@@ -1,1814 +0,0 @@
-//==---- math.hpp ---------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_MATH_HPP__
-#define __DPCT_MATH_HPP__
-
-#include <limits>
-#include <climits>
-#include <sycl/sycl.hpp>
-#include <type_traits>
-
-namespace dpct {
-namespace detail {
-template <typename VecT, class BinaryOperation, class = void>
-class vectorized_binary {
-public:
-  inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) {
-    VecT v4;
-    for (size_t i = 0; i < v4.size(); ++i) {
-      v4[i] = binary_op(a[i], b[i]);
-    }
-    return v4;
-  }
-};
-template <typename VecT, class BinaryOperation>
-class vectorized_binary<
-    VecT, BinaryOperation,
-    std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>> {
-public:
-  inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op) {
-    return binary_op(a, b).template as<VecT>();
-  }
-};
-
-template <typename T> inline bool isnan(const T a) { return sycl::isnan(a); }
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-inline bool isnan(const sycl::ext::oneapi::bfloat16 a) {
-  return sycl::ext::oneapi::experimental::isnan(a);
-}
-#endif
-} // namespace detail
-
-/// Compute fast_length for variable-length array
-/// \param [in] a The array
-/// \param [in] len Length of the array
-/// \returns The computed fast_length
-inline float fast_length(const float *a, int len) {
-  switch (len) {
-  case 1:
-    return a[0];
-  case 2:
-    return sycl::fast_length(sycl::float2(a[0], a[1]));
-  case 3:
-    return sycl::fast_length(sycl::float3(a[0], a[1], a[2]));
-  case 4:
-    return sycl::fast_length(sycl::float4(a[0], a[1], a[2], a[3]));
-  case 0:
-    return 0;
-  default:
-    float f = 0;
-    for (int i = 0; i < len; ++i)
-      f += a[i] * a[i];
-    return sycl::sqrt(f);
-  }
-}
-
-/// Calculate the square root of the input array.
-/// \param [in] a The array pointer
-/// \param [in] len Length of the array
-/// \returns The square root
-template <typename T> inline T length(const T *a, const int len) {
-  switch (len) {
-  case 1:
-    return a[0];
-  case 2:
-    return sycl::length(sycl::vec<T, 2>(a[0], a[1]));
-  case 3:
-    return sycl::length(sycl::vec<T, 3>(a[0], a[1], a[2]));
-  case 4:
-    return sycl::length(sycl::vec<T, 4>(a[0], a[1], a[2], a[3]));
-  default:
-    T ret = 0;
-    for (int i = 0; i < len; ++i)
-      ret += a[i] * a[i];
-    return sycl::sqrt(ret);
-  }
-}
-
-/// Returns min(max(val, min_val), max_val)
-/// \param [in] val The input value
-/// \param [in] min_val The minimum value
-/// \param [in] max_val The maximum value
-/// \returns the value between min_val and max_val
-template <typename T> inline T clamp(T val, T min_val, T max_val) {
-  return sycl::clamp(val, min_val, max_val);
-}
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-template <>
-inline sycl::ext::oneapi::bfloat16 clamp(sycl::ext::oneapi::bfloat16 val,
-                                         sycl::ext::oneapi::bfloat16 min_val,
-                                         sycl::ext::oneapi::bfloat16 max_val) {
-  if (val < min_val)
-    return min_val;
-  if (val > max_val)
-    return max_val;
-  return val;
-}
-#endif
-template <typename T>
-inline sycl::marray<T, 2> clamp(sycl::marray<T, 2> val,
-                                sycl::marray<T, 2> min_val,
-                                sycl::marray<T, 2> max_val) {
-  return {clamp(val[0], min_val[0], max_val[0]),
-          clamp(val[1], min_val[1], max_val[1])};
-}
-
-/// Performs comparison.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename T, class BinaryOperation>
-inline std::enable_if_t<
-    std::is_same_v<std::invoke_result_t<BinaryOperation, T, T>, bool>, bool>
-compare(const T a, const T b, const BinaryOperation binary_op) {
-  return binary_op(a, b);
-}
-template <typename T>
-inline std::enable_if_t<
-    std::is_same_v<std::invoke_result_t<std::not_equal_to<>, T, T>, bool>, bool>
-compare(const T a, const T b, const std::not_equal_to<> binary_op) {
-  return !detail::isnan(a) && !detail::isnan(b) && binary_op(a, b);
-}
-
-/// Performs unordered comparison.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename T, class BinaryOperation>
-inline std::enable_if_t<
-    std::is_same_v<std::invoke_result_t<BinaryOperation, T, T>, bool>, bool>
-unordered_compare(const T a, const T b, const BinaryOperation binary_op) {
-  return detail::isnan(a) || detail::isnan(b) || binary_op(a, b);
-}
-
-/// Performs 2 element comparison and return true if both results are true.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename T, class BinaryOperation>
-inline std::enable_if_t<T::size() == 2, bool>
-compare_both(const T a, const T b, const BinaryOperation binary_op) {
-  return compare(a[0], b[0], binary_op) && compare(a[1], b[1], binary_op);
-}
-
-/// Performs 2 element unordered comparison and return true if both results are
-/// true.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename T, class BinaryOperation>
-inline std::enable_if_t<T::size() == 2, bool>
-unordered_compare_both(const T a, const T b, const BinaryOperation binary_op) {
-  return unordered_compare(a[0], b[0], binary_op) &&
-         unordered_compare(a[1], b[1], binary_op);
-}
-
-/// Performs 2 element comparison.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename T, class BinaryOperation>
-inline std::enable_if_t<T::size() == 2, T>
-compare(const T a, const T b, const BinaryOperation binary_op) {
-  return {compare(a[0], b[0], binary_op), compare(a[1], b[1], binary_op)};
-}
-
-/// Performs 2 elements comparison, compare result of each element is 0 (false)
-/// or 0xffff (true), returns an unsigned int by composing compare result of two
-/// elements.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename T, class BinaryOperation>
-inline unsigned compare_mask(const sycl::vec<T, 2> a, const sycl::vec<T, 2> b,
-                             const BinaryOperation binary_op) {
-  return sycl::vec<short, 2>(-compare(a[0], b[0], binary_op),
-                             -compare(a[1], b[1], binary_op))
-      .as<sycl::vec<unsigned, 1>>();
-}
-template <typename T, class BinaryOperation>
-inline unsigned compare_mask(const sycl::marray<T, 2> a,
-                             const sycl::marray<T, 2> b,
-                             const BinaryOperation binary_op) {
-  return sycl::vec<short, 2>(-compare(a[0], b[0], binary_op),
-                             -compare(a[1], b[1], binary_op))
-      .as<sycl::vec<unsigned, 1>>();
-}
-
-/// Performs 2 element unordered comparison.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename T, class BinaryOperation>
-inline std::enable_if_t<T::size() == 2, T>
-unordered_compare(const T a, const T b, const BinaryOperation binary_op) {
-  return {unordered_compare(a[0], b[0], binary_op),
-          unordered_compare(a[1], b[1], binary_op)};
-}
-
-/// Performs 2 elements unordered comparison, compare result of each element is
-/// 0 (false) or 0xffff (true), returns an unsigned int by composing compare
-/// result of two elements.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] binary_op functor that implements the binary operation
-/// \returns the comparison result
-template <typename T, class BinaryOperation>
-inline unsigned unordered_compare_mask(const sycl::vec<T, 2> a,
-                                       const sycl::vec<T, 2> b,
-                                       const BinaryOperation binary_op) {
-  return sycl::vec<short, 2>(-unordered_compare(a[0], b[0], binary_op),
-                             -unordered_compare(a[1], b[1], binary_op))
-      .as<sycl::vec<unsigned, 1>>();
-}
-template <typename T, class BinaryOperation>
-inline unsigned unordered_compare_mask(const sycl::marray<T, 2> a,
-                                       const sycl::marray<T, 2> b,
-                                       const BinaryOperation binary_op) {
-  return sycl::vec<short, 2>(-unordered_compare(a[0], b[0], binary_op),
-                             -unordered_compare(a[1], b[1], binary_op))
-      .as<sycl::vec<unsigned, 1>>();
-}
-
-/// Bitfield-extract.
-///
-/// \tparam T The type of \param source value, must be an integer.
-/// \param source The source value to extracting.
-/// \param bit_start The position to start extracting.
-/// \param num_bits The number of bits to extracting.
-template <typename T>
-inline std::enable_if_t<std::is_unsigned_v<T>, T>
-bfe(const T source, const uint32_t bit_start, const uint32_t num_bits) {
-  const T mask = (T{1} << num_bits) - 1;
-  return (source >> bit_start) & mask;
-}
-
-/// Bitfield-extract with boundary checking.
-///
-/// Extract bit field from \param source and return the zero or sign-extended
-/// result. Source \param bit_start gives the bit field starting bit position,
-/// and source \param num_bits gives the bit field length in bits.
-///
-/// The result is padded with the sign bit of the extracted field. If the start
-/// position is beyond the msb of the input, the result was filled with the
-/// replicated sign bit of the extracted field.
-///
-/// \tparam T The type of \param source value, must be an integer.
-/// \param source The source value to extracting.
-/// \param bit_start The position to start extracting.
-/// \param num_bits The number of bits to extracting.
-template <typename T>
-inline std::enable_if_t<std::is_integral_v<T>, T>
-bfe_safe(const T source, const uint32_t bit_start, const uint32_t num_bits) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  if constexpr (std::is_same_v<T, int8_t> || std::is_same_v<T, int16_t> ||
-                std::is_same_v<T, int32_t>) {
-    int32_t res{};
-    asm volatile("bfe.s32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"((int32_t)source), "r"(bit_start), "r"(num_bits));
-    return res;
-  } else if constexpr (std::is_same_v<T, uint8_t> ||
-                       std::is_same_v<T, uint16_t> ||
-                       std::is_same_v<T, uint32_t>) {
-    uint32_t res{};
-    asm volatile("bfe.u32 %0, %1, %2, %3;"
-                 : "=r"(res)
-                 : "r"((uint32_t)source), "r"(bit_start), "r"(num_bits));
-    return res;
-  } else if constexpr (std::is_same_v<T, int64_t>) {
-    T res{};
-    asm volatile("bfe.s64 %0, %1, %2, %3;"
-                 : "=l"(res)
-                 : "l"(source), "r"(bit_start), "r"(num_bits));
-    return res;
-  } else if constexpr (std::is_same_v<T, uint64_t>) {
-    T res{};
-    asm volatile("bfe.u64 %0, %1, %2, %3;"
-                 : "=l"(res)
-                 : "l"(source), "r"(bit_start), "r"(num_bits));
-    return res;
-  }
-#endif
-  const uint32_t bit_width = CHAR_BIT * sizeof(T);
-  const uint32_t pos = std::min(bit_start, bit_width);
-  const uint32_t len = std::min(pos + num_bits, bit_width) - pos;
-  if constexpr (std::is_signed_v<T>) {
-    const T mask = (T{1} << len) - 1;
-
-    // Find the sign-bit, the result is padded with the sign bit of the
-    // extracted field.
-    //
-    // sign_bit = len == 0 ? 0 : source[min(pos + len - 1, bit_width - 1)]
-    const uint32_t sign_bit_pos = std::min(pos + len - 1, bit_width - 1);
-    const T sign_bit = len != 0 && ((source >> sign_bit_pos) & 1);
-    const T sign_bit_padding = (-sign_bit & ~mask);
-    return ((source >> pos) & mask) | sign_bit_padding;
-  } else {
-    return dpct::bfe(source, pos, len);
-  }
-}
-
-/// Bitfield-insert.
-///
-/// \tparam T The type of \param x and \param y , must be an unsigned integer.
-/// \param x The source of the bitfield.
-/// \param y The source where bitfield is inserted.
-/// \param bit_start The position to start insertion.
-/// \param num_bits The number of bits to insertion.
-template <typename T>
-inline std::enable_if_t<std::is_unsigned_v<T>, T>
-bfi(const T x, const T y, const uint32_t bit_start, const uint32_t num_bits) {
-  constexpr unsigned bit_width = CHAR_BIT * sizeof(T);
-
-  // if bit_start > bit_width || len == 0, should return y.
-  const uint32_t ignore_bfi = bit_start > bit_width || num_bits == 0;
-  T extract_bitfield_mask = (~(T{0}) >> (bit_width - num_bits)) << bit_start;
-  T clean_bitfield_mask = ~extract_bitfield_mask;
-  return (y & (-ignore_bfi | clean_bitfield_mask)) |
-         (~-ignore_bfi & ((x << bit_start) & extract_bitfield_mask));
-}
-
-/// Bitfield-insert with boundary checking.
-///
-/// Align and insert a bit field from \param x into \param y . Source \param
-/// bit_start gives the starting bit position for the insertion, and source
-/// \param num_bits gives the bit field length in bits.
-///
-/// \tparam T The type of \param x and \param y , must be an unsigned integer.
-/// \param x The source of the bitfield.
-/// \param y The source where bitfield is inserted.
-/// \param bit_start The position to start insertion.
-/// \param num_bits The number of bits to insertion.
-template <typename T>
-inline std::enable_if_t<std::is_unsigned_v<T>, T>
-bfi_safe(const T x, const T y, const uint32_t bit_start,
-         const uint32_t num_bits) {
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
-  if constexpr (std::is_same_v<T, uint8_t> || std::is_same_v<T, uint16_t> ||
-                std::is_same_v<T, uint32_t>) {
-    uint32_t res{};
-    asm volatile("bfi.b32 %0, %1, %2, %3, %4;"
-                 : "=r"(res)
-                 : "r"((uint32_t)x), "r"((uint32_t)y), "r"(bit_start),
-                   "r"(num_bits));
-    return res;
-  } else if constexpr (std::is_same_v<T, uint64_t>) {
-    uint64_t res{};
-    asm volatile("bfi.b64 %0, %1, %2, %3, %4;"
-                 : "=l"(res)
-                 : "l"(x), "l"(y), "r"(bit_start), "r"(num_bits));
-    return res;
-  }
-#endif
-  constexpr unsigned bit_width = CHAR_BIT * sizeof(T);
-  const uint32_t pos = std::min(bit_start, bit_width);
-  const uint32_t len = std::min(pos + num_bits, bit_width) - pos;
-  return dpct::bfi(x, y, pos, len);
-}
-
-/// Determine whether 2 element value is NaN.
-/// \param [in] a The input value
-/// \returns the comparison result
-template <typename T>
-inline std::enable_if_t<T::size() == 2, T> isnan(const T a) {
-  return {detail::isnan(a[0]), detail::isnan(a[1])};
-}
-
-/// Emulated function for __funnelshift_l
-inline unsigned int funnelshift_l(unsigned int low, unsigned int high,
-                                  unsigned int shift) {
-  return (sycl::upsample(high, low) << (shift & 31U)) >> 32;
-}
-
-/// Emulated function for __funnelshift_lc
-inline unsigned int funnelshift_lc(unsigned int low, unsigned int high,
-                                   unsigned int shift) {
-  return (sycl::upsample(high, low) << sycl::min(shift, 32U)) >> 32;
-}
-
-/// Emulated function for __funnelshift_r
-inline unsigned int funnelshift_r(unsigned int low, unsigned int high,
-                                  unsigned int shift) {
-  return (sycl::upsample(high, low) >> (shift & 31U)) & 0xFFFFFFFF;
-}
-
-/// Emulated function for __funnelshift_rc
-inline unsigned int funnelshift_rc(unsigned int low, unsigned int high,
-                                   unsigned int shift) {
-  return (sycl::upsample(high, low) >> sycl::min(shift, 32U)) & 0xFFFFFFFF;
-}
-
-/// cbrt function wrapper.
-template <typename T> inline T cbrt(T val) { return sycl::cbrt((T)val); }
-
-// min function overloads.
-// For floating-point types, `float` or `double` arguments are acceptable.
-// For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
-// `std::int64_t` type arguments are acceptable.
-inline double min(const double a, const float b) {
-  return sycl::fmin(a, static_cast<double>(b));
-}
-inline double min(const float a, const double b) {
-  return sycl::fmin(static_cast<double>(a), b);
-}
-inline float min(const float a, const float b) { return sycl::fmin(a, b); }
-inline double min(const double a, const double b) { return sycl::fmin(a, b); }
-inline std::uint32_t min(const std::uint32_t a, const std::int32_t b) {
-  return sycl::min(a, static_cast<std::uint32_t>(b));
-}
-inline std::uint32_t min(const std::int32_t a, const std::uint32_t b) {
-  return sycl::min(static_cast<std::uint32_t>(a), b);
-}
-inline std::int32_t min(const std::int32_t a, const std::int32_t b) {
-  return sycl::min(a, b);
-}
-inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b) {
-  return sycl::min(a, b);
-}
-inline std::uint64_t min(const std::uint64_t a, const std::int64_t b) {
-  return sycl::min(a, static_cast<std::uint64_t>(b));
-}
-inline std::uint64_t min(const std::int64_t a, const std::uint64_t b) {
-  return sycl::min(static_cast<std::uint64_t>(a), b);
-}
-inline std::int64_t min(const std::int64_t a, const std::int64_t b) {
-  return sycl::min(a, b);
-}
-inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b) {
-  return sycl::min(a, b);
-}
-inline std::uint64_t min(const std::uint64_t a, const std::int32_t b) {
-  return sycl::min(a, static_cast<std::uint64_t>(b));
-}
-inline std::uint64_t min(const std::int32_t a, const std::uint64_t b) {
-  return sycl::min(static_cast<std::uint64_t>(a), b);
-}
-inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b) {
-  return sycl::min(a, static_cast<std::uint64_t>(b));
-}
-inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b) {
-  return sycl::min(static_cast<std::uint64_t>(a), b);
-}
-// max function overloads.
-// For floating-point types, `float` or `double` arguments are acceptable.
-// For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
-// `std::int64_t` type arguments are acceptable.
-inline double max(const double a, const float b) {
-  return sycl::fmax(a, static_cast<double>(b));
-}
-inline double max(const float a, const double b) {
-  return sycl::fmax(static_cast<double>(a), b);
-}
-inline float max(const float a, const float b) { return sycl::fmax(a, b); }
-inline double max(const double a, const double b) { return sycl::fmax(a, b); }
-inline std::uint32_t max(const std::uint32_t a, const std::int32_t b) {
-  return sycl::max(a, static_cast<std::uint32_t>(b));
-}
-inline std::uint32_t max(const std::int32_t a, const std::uint32_t b) {
-  return sycl::max(static_cast<std::uint32_t>(a), b);
-}
-inline std::int32_t max(const std::int32_t a, const std::int32_t b) {
-  return sycl::max(a, b);
-}
-inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b) {
-  return sycl::max(a, b);
-}
-inline std::uint64_t max(const std::uint64_t a, const std::int64_t b) {
-  return sycl::max(a, static_cast<std::uint64_t>(b));
-}
-inline std::uint64_t max(const std::int64_t a, const std::uint64_t b) {
-  return sycl::max(static_cast<std::uint64_t>(a), b);
-}
-inline std::int64_t max(const std::int64_t a, const std::int64_t b) {
-  return sycl::max(a, b);
-}
-inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b) {
-  return sycl::max(a, b);
-}
-inline std::uint64_t max(const std::uint64_t a, const std::int32_t b) {
-  return sycl::max(a, static_cast<std::uint64_t>(b));
-}
-inline std::uint64_t max(const std::int32_t a, const std::uint64_t b) {
-  return sycl::max(static_cast<std::uint64_t>(a), b);
-}
-inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b) {
-  return sycl::max(a, static_cast<std::uint64_t>(b));
-}
-inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b) {
-  return sycl::max(static_cast<std::uint64_t>(a), b);
-}
-
-// pow functions overload.
-inline float pow(const float a, const int b) { return sycl::pown(a, b); }
-inline double pow(const double a, const int b) { return sycl::pown(a, b); }
-inline float pow(const float a, const float b) { return sycl::pow(a, b); }
-inline double pow(const double a, const double b) { return sycl::pow(a, b); }
-template <typename T, typename U>
-inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
-pow(const T a, const U b) {
-  return sycl::pow(a, static_cast<T>(b));
-}
-template <typename T, typename U>
-inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
-pow(const T a, const U b) {
-  return sycl::pow(static_cast<double>(a), static_cast<double>(b));
-}
-
-namespace detail {
-template <typename T>
-constexpr bool is_floating_point =
-    std::disjunction_v<std::is_floating_point<T>, std::is_same<T, sycl::half>
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-                       ,
-                       std::is_same<T, sycl::ext::oneapi::bfloat16>
-#endif
-                       >;
-} // namespace detail
-
-/// Performs relu saturation.
-/// \param [in] a The input value
-/// \returns the relu saturation result
-template <typename T> inline T relu(T a) {
-  T zero{};
-  if constexpr (detail::is_floating_point<T>)
-    return !detail::isnan(a) && a < zero ? zero : a;
-  else
-    return a < zero ? zero : a;
-}
-template <class T> inline sycl::vec<T, 2> relu(const sycl::vec<T, 2> a) {
-  return {relu(a[0]), relu(a[1])};
-}
-template <class T> inline sycl::marray<T, 2> relu(const sycl::marray<T, 2> a) {
-  return {relu(a[0]), relu(a[1])};
-}
-
-/// Performs complex number multiply addition.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns the operation result
-template <typename T>
-inline sycl::vec<T, 2> complex_mul_add(const sycl::vec<T, 2> a,
-                                       const sycl::vec<T, 2> b,
-                                       const sycl::vec<T, 2> c) {
-  return sycl::vec<T, 2>{a[0] * b[0] - a[1] * b[1] + c[0],
-                         a[0] * b[1] + a[1] * b[0] + c[1]};
-}
-template <typename T>
-inline sycl::marray<T, 2> complex_mul_add(const sycl::marray<T, 2> a,
-                                          const sycl::marray<T, 2> b,
-                                          const sycl::marray<T, 2> c) {
-  return sycl::marray<T, 2>{a[0] * b[0] - a[1] * b[1] + c[0],
-                            a[0] * b[1] + a[1] * b[0] + c[1]};
-}
-
-/// Performs 2 elements comparison and returns the bigger one. If either of
-/// inputs is NaN, then return NaN.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns the bigger value
-template <typename T> inline T fmax_nan(const T a, const T b) {
-  if (detail::isnan(a) || detail::isnan(b))
-    return NAN;
-  return sycl::fmax(a, b);
-}
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-template <>
-inline sycl::ext::oneapi::bfloat16
-fmax_nan(const sycl::ext::oneapi::bfloat16 a,
-         const sycl::ext::oneapi::bfloat16 b) {
-  if (detail::isnan(a) || detail::isnan(b))
-    return NAN;
-  return sycl::fmax(float(a), float(b));
-}
-#endif
-template <typename T>
-inline sycl::vec<T, 2> fmax_nan(const sycl::vec<T, 2> a,
-                                const sycl::vec<T, 2> b) {
-  return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])};
-}
-template <typename T>
-inline sycl::marray<T, 2> fmax_nan(const sycl::marray<T, 2> a,
-                                   const sycl::marray<T, 2> b) {
-  return {fmax_nan(a[0], b[0]), fmax_nan(a[1], b[1])};
-}
-
-/// Performs 2 elements comparison and returns the smaller one. If either of
-/// inputs is NaN, then return NaN.
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns the smaller value
-template <typename T> inline T fmin_nan(const T a, const T b) {
-  if (detail::isnan(a) || detail::isnan(b))
-    return NAN;
-  return sycl::fmin(a, b);
-}
-#ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
-template <>
-inline sycl::ext::oneapi::bfloat16
-fmin_nan(const sycl::ext::oneapi::bfloat16 a,
-         const sycl::ext::oneapi::bfloat16 b) {
-  if (detail::isnan(a) || detail::isnan(b))
-    return NAN;
-  return sycl::fmin(float(a), float(b));
-}
-#endif
-template <typename T>
-inline sycl::vec<T, 2> fmin_nan(const sycl::vec<T, 2> a,
-                                const sycl::vec<T, 2> b) {
-  return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])};
-}
-template <typename T>
-inline sycl::marray<T, 2> fmin_nan(const sycl::marray<T, 2> a,
-                                   const sycl::marray<T, 2> b) {
-  return {fmin_nan(a[0], b[0]), fmin_nan(a[1], b[1])};
-}
-
-/// A sycl::abs wrapper functors.
-struct abs {
-  template <typename T> auto operator()(const T x) const {
-    return sycl::abs(x);
-  }
-};
-
-/// A sycl::abs_diff wrapper functors.
-struct abs_diff {
-  template <typename T> auto operator()(const T x, const T y) const {
-    return sycl::abs_diff(x, y);
-  }
-};
-
-/// A sycl::add_sat wrapper functors.
-struct add_sat {
-  template <typename T> auto operator()(const T x, const T y) const {
-    return sycl::add_sat(x, y);
-  }
-};
-
-/// A sycl::rhadd wrapper functors.
-struct rhadd {
-  template <typename T> auto operator()(const T x, const T y) const {
-    return sycl::rhadd(x, y);
-  }
-};
-
-/// A sycl::hadd wrapper functors.
-struct hadd {
-  template <typename T> auto operator()(const T x, const T y) const {
-    return sycl::hadd(x, y);
-  }
-};
-
-/// A sycl::max wrapper functors.
-struct maximum {
-  template <typename T> auto operator()(const T x, const T y) const {
-    return sycl::max(x, y);
-  }
-};
-
-/// A sycl::min wrapper functors.
-struct minimum {
-  template <typename T> auto operator()(const T x, const T y) const {
-    return sycl::min(x, y);
-  }
-};
-
-/// A sycl::sub_sat wrapper functors.
-struct sub_sat {
-  template <typename T> auto operator()(const T x, const T y) const {
-    return sycl::sub_sat(x, y);
-  }
-};
-
-/// Compute vectorized binary operation value for two values, with each value
-/// treated as a vector type \p VecT.
-/// \tparam [in] VecT The type of the vector
-/// \tparam [in] BinaryOperation The binary operation class
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The vectorized binary operation value of the two values
-template <typename VecT, class BinaryOperation>
-inline unsigned vectorized_binary(unsigned a, unsigned b,
-                                  const BinaryOperation binary_op) {
-  sycl::vec<unsigned, 1> v0{a}, v1{b};
-  auto v2 = v0.as<VecT>();
-  auto v3 = v1.as<VecT>();
-  auto v4 =
-      detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op);
-  v0 = v4.template as<sycl::vec<unsigned, 1>>();
-  return v0;
-}
-
-/// Compute vectorized isgreater for two values, with each value treated as a
-/// vector type \p S.
-/// \tparam [in] S The type of the vector
-/// \tparam [in] T The type of the original values
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The vectorized greater than of the two values
-template <typename S, typename T> inline T vectorized_isgreater(T a, T b) {
-  sycl::vec<T, 1> v0{a}, v1{b};
-  auto v2 = v0.template as<S>();
-  auto v3 = v1.template as<S>();
-  auto v4 = v2 > v3;
-  v0 = v4.template as<sycl::vec<T, 1>>();
-  return v0;
-}
-
-/// Compute vectorized max for two values, with each value treated as a vector
-/// type \p S.
-/// \tparam [in] S The type of the vector
-/// \tparam [in] T The type of the original values
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The vectorized max of the two values
-template <typename S, typename T> inline T vectorized_max(T a, T b) {
-  sycl::vec<T, 1> v0{a}, v1{b};
-  auto v2 = v0.template as<S>();
-  auto v3 = v1.template as<S>();
-  auto v4 = sycl::max(v2, v3);
-  v0 = v4.template as<sycl::vec<T, 1>>();
-  return v0;
-}
-
-/// Compute vectorized min for two values, with each value treated as a vector
-/// type \p S.
-/// \tparam [in] S The type of the vector
-/// \tparam [in] T The type of the original values
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The vectorized min of the two values
-template <typename S, typename T> inline T vectorized_min(T a, T b) {
-  sycl::vec<T, 1> v0{a}, v1{b};
-  auto v2 = v0.template as<S>();
-  auto v3 = v1.template as<S>();
-  auto v4 = sycl::min(v2, v3);
-  v0 = v4.template as<sycl::vec<T, 1>>();
-  return v0;
-}
-
-/// Compute vectorized unary operation for a value, with the value treated as a
-/// vector type \p VecT.
-/// \tparam [in] VecT The type of the vector
-/// \tparam [in] UnaryOperation The unary operation class
-/// \param [in] a The input value
-/// \returns The vectorized unary operation value of the input value
-template <typename VecT, class UnaryOperation>
-inline unsigned vectorized_unary(unsigned a, const UnaryOperation unary_op) {
-  sycl::vec<unsigned, 1> v0{a};
-  auto v1 = v0.as<VecT>();
-  auto v2 = unary_op(v1);
-  v0 = v2.template as<sycl::vec<unsigned, 1>>();
-  return v0;
-}
-
-/// Compute vectorized absolute difference for two values without modulo
-/// overflow, with each value treated as a vector type \p VecT.
-/// \tparam [in] VecT The type of the vector
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The vectorized absolute difference of the two values
-template <typename VecT>
-inline unsigned vectorized_sum_abs_diff(unsigned a, unsigned b) {
-  sycl::vec<unsigned, 1> v0{a}, v1{b};
-  auto v2 = v0.as<VecT>();
-  auto v3 = v1.as<VecT>();
-  auto v4 = sycl::abs_diff(v2, v3);
-  unsigned sum = 0;
-  for (size_t i = 0; i < v4.size(); ++i) {
-    sum += v4[i];
-  }
-  return sum;
-}
-
-namespace detail {
-/// Extend the 'val' to 'bit' size, zero extend for unsigned int and signed
-/// extend for signed int.
-template <typename T> inline auto zero_or_signed_extent(T val, unsigned bit) {
-  if constexpr (std::is_signed_v<T>) {
-    if constexpr (std::is_same_v<T, int32_t>) {
-      assert(bit < 64 &&
-             "When extend int32 value, bit must be smaller than 64.");
-      return int64_t(val) << (64 - bit) >> (64 - bit);
-    } else if constexpr (std::is_same_v<T, int16_t>) {
-      assert(bit < 32 &&
-             "When extend int16 value, bit must be smaller than 32.");
-      return int32_t(val) << (32 - bit) >> (32 - bit);
-    } else if constexpr (std::is_same_v<T, int8_t>) {
-      assert(bit < 16 &&
-             "When extend int8 value, bit must be smaller than 16.");
-      return int16_t(val) << (16 - bit) >> (16 - bit);
-    } else {
-      assert(bit < 64 && "Cannot extend int64 value.");
-      return val;
-    }
-  } else
-    return val;
-}
-
-template <typename RetT, bool NeedSat, typename AT, typename BT,
-          typename BinaryOperation>
-inline constexpr std::enable_if_t<
-    std::is_integral_v<AT> && std::is_integral_v<BT> &&
-        std::is_integral_v<RetT> && sizeof(AT) == 4 && sizeof(BT) == 4 &&
-        sizeof(RetT) == 4,
-    RetT>
-extend_binary(AT a, BT b, BinaryOperation binary_op) {
-  int64_t extend_a = zero_or_signed_extent(a, 33);
-  int64_t extend_b = zero_or_signed_extent(b, 33);
-  int64_t ret = binary_op(extend_a, extend_b);
-  if constexpr (NeedSat)
-    return dpct::clamp<int64_t>(ret, std::numeric_limits<RetT>::min(),
-                                std::numeric_limits<RetT>::max());
-  return ret;
-}
-
-template <typename RetT, bool NeedSat, typename AT, typename BT, typename CT,
-          typename BinaryOperation1, typename BinaryOperation2>
-inline constexpr std::enable_if_t<
-    std::is_integral_v<AT> && std::is_integral_v<BT> &&
-        std::is_integral_v<CT> && std::is_integral_v<RetT> && sizeof(AT) == 4 &&
-        sizeof(BT) == 4 && sizeof(CT) == 4 && sizeof(RetT) == 4,
-    RetT>
-extend_binary(AT a, BT b, CT c, BinaryOperation1 binary_op,
-              BinaryOperation2 second_op) {
-  int64_t extend_a = zero_or_signed_extent(a, 33);
-  int64_t extend_b = zero_or_signed_extent(b, 33);
-  int64_t extend_temp =
-      zero_or_signed_extent(binary_op(extend_a, extend_b), 34);
-  if constexpr (NeedSat)
-    extend_temp =
-        dpct::clamp<int64_t>(extend_temp, std::numeric_limits<RetT>::min(),
-                             std::numeric_limits<RetT>::max());
-  int64_t extend_c = zero_or_signed_extent(c, 33);
-  return second_op(extend_temp, extend_c);
-}
-
-template <typename T> sycl::vec<int32_t, 2> extractAndExtend2(T a) {
-  sycl::vec<int32_t, 2> ret;
-  sycl::vec<T, 1> va{a};
-  if constexpr (std::is_signed_v<T>) {
-    auto v = va.template as<sycl::vec<int16_t, 2>>();
-    ret[0] = zero_or_signed_extent(v[0], 17);
-    ret[1] = zero_or_signed_extent(v[1], 17);
-  } else {
-    auto v = va.template as<sycl::vec<uint16_t, 2>>();
-    ret[0] = zero_or_signed_extent(v[0], 17);
-    ret[1] = zero_or_signed_extent(v[1], 17);
-  }
-  return ret;
-}
-
-template <typename T> sycl::vec<int16_t, 4> extractAndExtend4(T a) {
-  sycl::vec<int16_t, 4> ret;
-  sycl::vec<T, 1> va{a};
-  if constexpr (std::is_signed_v<T>) {
-    auto v = va.template as<sycl::vec<int8_t, 4>>();
-    ret[0] = zero_or_signed_extent(v[0], 9);
-    ret[1] = zero_or_signed_extent(v[1], 9);
-    ret[2] = zero_or_signed_extent(v[2], 9);
-    ret[3] = zero_or_signed_extent(v[3], 9);
-  } else {
-    auto v = va.template as<sycl::vec<uint8_t, 4>>();
-    ret[0] = zero_or_signed_extent(v[0], 9);
-    ret[1] = zero_or_signed_extent(v[1], 9);
-    ret[2] = zero_or_signed_extent(v[2], 9);
-    ret[3] = zero_or_signed_extent(v[3], 9);
-  }
-  return ret;
-}
-
-template <typename RetT, bool NeedSat, bool NeedAdd, typename AT, typename BT,
-          typename BinaryOperation>
-inline constexpr std::enable_if_t<
-    std::is_integral_v<AT> && std::is_integral_v<BT> &&
-        std::is_integral_v<RetT> && sizeof(AT) == 4 && sizeof(BT) == 4 &&
-        sizeof(RetT) == 4,
-    RetT>
-extend_vbinary2(AT a, BT b, RetT c, BinaryOperation binary_op) {
-  sycl::vec<int32_t, 2> extend_a = extractAndExtend2(a);
-  sycl::vec<int32_t, 2> extend_b = extractAndExtend2(b);
-  sycl::vec<int32_t, 2> temp{binary_op(extend_a[0], extend_b[0]),
-                             binary_op(extend_a[1], extend_b[1])};
-  if constexpr (NeedSat) {
-    int32_t min_val = 0, max_val = 0;
-    if constexpr (std::is_signed_v<RetT>) {
-      min_val = std::numeric_limits<int16_t>::min();
-      max_val = std::numeric_limits<int16_t>::max();
-    } else {
-      min_val = std::numeric_limits<uint16_t>::min();
-      max_val = std::numeric_limits<uint16_t>::max();
-    }
-    temp = dpct::clamp(temp, {min_val, min_val}, {max_val, max_val});
-  }
-  if constexpr (NeedAdd) {
-    return temp[0] + temp[1] + c;
-  }
-  if constexpr (std::is_signed_v<RetT>) {
-    return sycl::vec<int16_t, 2>{temp[0], temp[1]}.as<sycl::vec<RetT, 1>>();
-  } else {
-    return sycl::vec<uint16_t, 2>{temp[0], temp[1]}.as<sycl::vec<RetT, 1>>();
-  }
-}
-
-template <typename RetT, bool NeedSat, bool NeedAdd, typename AT, typename BT,
-          typename BinaryOperation>
-inline constexpr std::enable_if_t<
-    std::is_integral_v<AT> && std::is_integral_v<BT> &&
-        std::is_integral_v<RetT> && sizeof(AT) == 4 && sizeof(BT) == 4 &&
-        sizeof(RetT) == 4,
-    RetT>
-extend_vbinary4(AT a, BT b, RetT c, BinaryOperation binary_op) {
-  sycl::vec<int16_t, 4> extend_a = extractAndExtend4(a);
-  sycl::vec<int16_t, 4> extend_b = extractAndExtend4(b);
-  sycl::vec<int16_t, 4> temp{
-      binary_op(extend_a[0], extend_b[0]), binary_op(extend_a[1], extend_b[1]),
-      binary_op(extend_a[2], extend_b[2]), binary_op(extend_a[3], extend_b[3])};
-  if constexpr (NeedSat) {
-    int16_t min_val = 0, max_val = 0;
-    if constexpr (std::is_signed_v<RetT>) {
-      min_val = std::numeric_limits<int8_t>::min();
-      max_val = std::numeric_limits<int8_t>::max();
-    } else {
-      min_val = std::numeric_limits<uint8_t>::min();
-      max_val = std::numeric_limits<uint8_t>::max();
-    }
-    temp = dpct::clamp(temp, {min_val, min_val, min_val, min_val},
-                       {max_val, max_val, max_val, max_val});
-  }
-  if constexpr (NeedAdd) {
-    return temp[0] + temp[1] + temp[2] + temp[3] + c;
-  }
-  if constexpr (std::is_signed_v<RetT>) {
-    return sycl::vec<int8_t, 4>{temp[0], temp[1], temp[2], temp[3]}
-        .as<sycl::vec<RetT, 1>>();
-  } else {
-    return sycl::vec<uint8_t, 4>{temp[0], temp[1], temp[2], temp[3]}
-        .as<sycl::vec<RetT, 1>>();
-  }
-}
-
-template <typename T1, typename T2>
-using dot_product_acc_t =
-    std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
-                       uint32_t, int32_t>;
-
-template <typename T> sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val) {
-  return sycl::vec<T, 1>(val)
-      .template as<sycl::vec<
-          std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
-      .template convert<T>();
-}
-
-template <typename T> sycl::vec<T, 2> extract_and_sign_or_zero_extend2(T val) {
-  return sycl::vec<T, 1>(val)
-      .template as<sycl::vec<
-          std::conditional_t<std::is_signed_v<T>, int16_t, uint16_t>, 2>>()
-      .template convert<T>();
-}
-} // namespace detail
-
-/// Two-way dot product-accumulate. Calculate and return interger_vector2(
-/// \param a) dot product interger_vector2(low16_bit( \param b))  + \param c
-///
-/// \tparam [in] T1 The type of first value.
-/// \tparam [in] T2 The type of second value.
-/// \param [in] a The first value.
-/// \param [in] b The second value.
-/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
-/// uint32_t else has type int32_t.
-/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
-/// result.
-template <typename T1, typename T2, typename T3>
-inline auto dp2a_lo(T1 a, T2 b, T3 c) {
-  detail::dot_product_acc_t<T1, T2> res = c;
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
-    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
-  res = __dp2a_lo(a, b, c);
-#else
-  auto va = ::dpct::detail::extract_and_sign_or_zero_extend2(a);
-  auto vb = ::dpct::detail::extract_and_sign_or_zero_extend4(b);
-  res += va[0] * vb[0];
-  res += va[1] * vb[1];
-#endif
-  return res;
-}
-
-/// Two-way dot product-accumulate. Calculate and return interger_vector2(
-/// \param a) dot product interger_vector2(high_16bit( \param b)) + \param c
-///
-/// \tparam [in] T1 The type of first value.
-/// \tparam [in] T2 The type of second value.
-/// \param [in] a The first value.
-/// \param [in] b The second value.
-/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
-/// uint32_t else has type int32_t.
-/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
-/// result.
-template <typename T1, typename T2, typename T3>
-inline auto dp2a_hi(T1 a, T2 b, T3 c) {
-  detail::dot_product_acc_t<T1, T2> res = c;
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
-    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
-  res = __dp2a_hi(a, b, c);
-#else
-  auto va = ::dpct::detail::extract_and_sign_or_zero_extend2(a);
-  auto vb = ::dpct::detail::extract_and_sign_or_zero_extend4(b);
-  res += va[0] * vb[2];
-  res += va[1] * vb[3];
-#endif
-  return res;
-}
-
-/// Four-way byte dot product-accumulate. Calculate and return interger_vector4(
-/// \param a) dot product interger_vector4( \param b)  + \param c
-///
-/// \tparam [in] T1 The type of first value.
-/// \tparam [in] T2 The type of second value.
-/// \param [in] a The first value.
-/// \param [in] b The second value.
-/// \param [in] c The third value. It has type uint32_t if both T1 and T1 are
-/// uint32_t else has type int32_t.
-/// \return Four-way byte dot product which is accumulated in 32-bit result.
-template <typename T1, typename T2, typename T3>
-inline auto dp4a(T1 a, T2 b, T3 c) {
-  detail::dot_product_acc_t<T1, T2> res = c;
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
-    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
-  res = __dp4a(a, b, c);
-#else
-  auto va = ::dpct::detail::extract_and_sign_or_zero_extend4(a);
-  auto vb = ::dpct::detail::extract_and_sign_or_zero_extend4(b);
-  res += va[0] * vb[0];
-  res += va[1] * vb[1];
-  res += va[2] * vb[2];
-  res += va[3] * vb[3];
-#endif
-  return res;
-}
-
-/// Extend \p a and \p b to 33 bit and add them.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend addition of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_add(AT a, BT b) {
-  return detail::extend_binary<RetT, false>(a, b, std::plus());
-}
-
-/// Extend Inputs to 33 bit, add \p a, \p b, then do \p second_op with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] CT The type of the third value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend addition of \p a, \p b and \p second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_add(AT a, BT b, CT c, BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b, c, std::plus(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and add them with saturation.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend addition of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_add_sat(AT a, BT b) {
-  return detail::extend_binary<RetT, true>(a, b, std::plus());
-}
-
-/// Extend Inputs to 33 bit, add \p a, \p b with saturation, then do \p
-/// second_op with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] CT The type of the third value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend addition of \p a, \p b with saturation and \p second_op
-/// with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_add_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b, c, std::plus(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and minus them.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend subtraction of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_sub(AT a, BT b) {
-  return detail::extend_binary<RetT, false>(a, b, std::minus());
-}
-
-/// Extend Inputs to 33 bit, minus \p a, \p b, then do \p second_op with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] CT The type of the third value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend subtraction of \p a, \p b and \p second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_sub(AT a, BT b, CT c, BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b, c, std::minus(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and minus them with saturation.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend subtraction of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_sub_sat(AT a, BT b) {
-  return detail::extend_binary<RetT, true>(a, b, std::minus());
-}
-
-/// Extend Inputs to 33 bit, minus \p a, \p b with saturation, then do \p
-/// second_op with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] CT The type of the third value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend subtraction of \p a, \p b with saturation and \p
-/// second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_sub_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b, c, std::minus(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and do abs_diff.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend abs_diff of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_absdiff(AT a, BT b) {
-  return detail::extend_binary<RetT, false>(a, b, abs_diff());
-}
-
-/// Extend Inputs to 33 bit, abs_diff \p a, \p b, then do \p second_op with \p
-/// c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] CT The type of the third value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend abs_diff of \p a, \p b and \p second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_absdiff(AT a, BT b, CT c,
-                                     BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b, c, abs_diff(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and do abs_diff with saturation.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The extend abs_diff of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_absdiff_sat(AT a, BT b) {
-  return detail::extend_binary<RetT, true>(a, b, abs_diff());
-}
-
-/// Extend Inputs to 33 bit, abs_diff \p a, \p b with saturation, then do \p
-/// second_op with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] CT The type of the third value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The extend abs_diff of \p a, \p b with saturation and \p
-/// second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_absdiff_sat(AT a, BT b, CT c,
-                                         BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b, c, abs_diff(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return smaller one.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The smaller one of the two extended values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_min(AT a, BT b) {
-  return detail::extend_binary<RetT, false>(a, b, minimum());
-}
-
-/// Extend Inputs to 33 bit, find the smaller one in \p a, \p b, then do \p
-/// second_op with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] CT The type of the third value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The smaller one of \p a, \p b and \p second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_min(AT a, BT b, CT c, BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b, c, minimum(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return smaller one with saturation.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The smaller one of the two extended values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_min_sat(AT a, BT b) {
-  return detail::extend_binary<RetT, true>(a, b, minimum());
-}
-
-/// Extend Inputs to 33 bit, find the smaller one in \p a, \p b with saturation,
-/// then do \p second_op with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] CT The type of the third value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The smaller one of \p a, \p b with saturation and \p
-/// second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_min_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b, c, minimum(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return bigger one.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The bigger one of the two extended values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_max(AT a, BT b) {
-  return detail::extend_binary<RetT, false>(a, b, maximum());
-}
-
-/// Extend Inputs to 33 bit, find the bigger one in \p a, \p b, then do \p
-/// second_op with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] CT The type of the third value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The bigger one of \p a, \p b and \p second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_max(AT a, BT b, CT c, BinaryOperation second_op) {
-  return detail::extend_binary<RetT, false>(a, b, c, maximum(), second_op);
-}
-
-/// Extend \p a and \p b to 33 bit and return bigger one with saturation.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \returns The bigger one of the two extended values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_max_sat(AT a, BT b) {
-  return detail::extend_binary<RetT, true>(a, b, maximum());
-}
-
-/// Extend Inputs to 33 bit, find the bigger one in \p a, \p b with saturation,
-/// then do \p second_op with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \tparam [in] CT The type of the third value, can only be 32 bit integer
-/// \tparam [in] BinaryOperation The type of the second operation
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \param [in] second_op The operation to do with the third value
-/// \returns The bigger one of \p a, \p b with saturation and \p
-/// second_op with \p c
-template <typename RetT, typename AT, typename BT, typename CT,
-          typename BinaryOperation>
-inline constexpr RetT extend_max_sat(AT a, BT b, CT c,
-                                     BinaryOperation second_op) {
-  return detail::extend_binary<RetT, true>(a, b, c, maximum(), second_op);
-}
-
-/// Compute vectorized addition of \p a and \p b, with each value treated as a
-/// 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd2(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, false>(a, b, c, std::plus());
-}
-
-/// Compute vectorized addition of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized addition of the two
-/// values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, true>(a, b, c, std::plus());
-}
-
-/// Compute vectorized addition of \p a and \p b with saturation, with each
-/// value treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, true, false>(a, b, c, std::plus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub2(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, false>(a, b, c, std::minus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 2 elements vector type and extend each element to 17 bit. Then add each
-/// half of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized subtraction of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, true>(a, b, c, std::minus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b with saturation, with each
-/// value treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, true, false>(a, b, c, std::minus());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, false>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized abs_diff of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, true>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
-/// value treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, true, false>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin2(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, false>(a, b, c, minimum());
-}
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized minimum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, true>(a, b, c, minimum());
-}
-
-/// Compute vectorized minimum of \p a and \p b with saturation, with each value
-/// treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, true, false>(a, b, c, minimum());
-}
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax2(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, false>(a, b, c, maximum());
-}
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2
-/// elements vector type and extend each element to 17 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized maximum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, false, true>(a, b, c, maximum());
-}
-
-/// Compute vectorized maximum of \p a and \p b with saturation, with each value
-/// treated as a 2 elements vector type and extend each element to 17 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary2<RetT, true, false>(a, b, c, maximum());
-}
-
-/// Compute vectorized addition of \p a and \p b, with each value treated as a
-/// 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd4(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, false>(a, b, c, std::plus());
-}
-
-/// Compute vectorized addition of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized addition of the two
-/// values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd4_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, true>(a, b, c, std::plus());
-}
-
-/// Compute vectorized addition of \p a and \p b with saturation, with each
-/// value treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized addition of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vadd4_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, true, false>(a, b, c, std::plus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub4(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, false>(a, b, c, std::minus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b, with each value treated as
-/// a 4 elements vector type and extend each element to 9 bit. Then add each
-/// half of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized subtraction of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub4_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, true>(a, b, c, std::minus());
-}
-
-/// Compute vectorized subtraction of \p a and \p b with saturation, with each
-/// value treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized subtraction of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vsub4_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, true, false>(a, b, c, std::minus());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff4(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, false>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized abs_diff of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff4_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, true>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
-/// value treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized abs_diff of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vabsdiff4_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, true, false>(a, b, c, abs_diff());
-}
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin4(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, false>(a, b, c, minimum());
-}
-
-/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized minimum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin4_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, true>(a, b, c, minimum());
-}
-
-/// Compute vectorized minimum of \p a and \p b with saturation, with each value
-/// treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized minimum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmin4_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, true, false>(a, b, c, minimum());
-}
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax4(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, false>(a, b, c, maximum());
-}
-
-/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4
-/// elements vector type and extend each element to 9 bit. Then add each half
-/// of the result and add with \p c.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The addition of each half of extend vectorized maximum of the
-/// two values and the third value
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax4_add(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, false, true>(a, b, c, maximum());
-}
-
-/// Compute vectorized maximum of \p a and \p b with saturation, with each value
-/// treated as a 4 elements vector type and extend each element to 9 bit.
-/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
-/// \tparam [in] AT The type of the first value, can only be 32 bit integer
-/// \tparam [in] BT The type of the second value, can only be 32 bit integer
-/// \param [in] a The first value
-/// \param [in] b The second value
-/// \param [in] c The third value
-/// \returns The extend vectorized maximum of the two values with saturation
-template <typename RetT, typename AT, typename BT>
-inline constexpr RetT extend_vmax4_sat(AT a, BT b, RetT c) {
-  return detail::extend_vbinary4<RetT, true, false>(a, b, c, maximum());
-}
-} // namespace dpct
-
-#endif // __DPCT_MATH_HPP__
diff --git a/dpct/memory.hpp b/dpct/memory.hpp
deleted file mode 100644
index 88afe77fafcf3..0000000000000
--- a/dpct/memory.hpp
+++ /dev/null
@@ -1,1497 +0,0 @@
-//==---- memory.hpp -------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_MEMORY_HPP__
-#define __DPCT_MEMORY_HPP__
-
-#include "device.hpp"
-#include <sycl/sycl.hpp>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <mutex>
-#include <unordered_map>
-#include <map>
-#include <utility>
-#include <thread>
-#include <type_traits>
-
-#if defined(__linux__)
-#include <sys/mman.h>
-#elif defined(_WIN64)
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#else
-#error "Only support Windows and Linux."
-#endif
-
-namespace dpct {
-
-enum memcpy_direction {
-  host_to_host,
-  host_to_device,
-  device_to_host,
-  device_to_device,
-  automatic
-};
-enum memory_region {
-  global = 0,  // device global memory
-  constant,    // device constant memory
-  local,       // device local memory
-  shared,      // memory which can be accessed by host and device
-};
-
-typedef uint8_t byte_t;
-
-/// Buffer type to be used in Memory Management runtime.
-typedef sycl::buffer<byte_t> buffer_t;
-
-/// Pitched 2D/3D memory data.
-class pitched_data {
-public:
-  pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
-  pitched_data(void *data, size_t pitch, size_t x, size_t y)
-      : _data(data), _pitch(pitch), _x(x), _y(y) {}
-
-  void *get_data_ptr() { return _data; }
-  void set_data_ptr(void *data) { _data = data; }
-
-  size_t get_pitch() { return _pitch; }
-  void set_pitch(size_t pitch) { _pitch = pitch; }
-
-  size_t get_x() { return _x; }
-  void set_x(size_t x) { _x = x; };
-
-  size_t get_y() { return _y; }
-  void set_y(size_t y) { _y = y; }
-
-private:
-  void *_data;
-  size_t _pitch, _x, _y;
-};
-
-namespace detail {
-class mem_mgr {
-  mem_mgr() {
-    // Reserved address space, no real memory allocation happens here.
-#if defined(__linux__)
-    mapped_address_space =
-        (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
-                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-#elif defined(_WIN64)
-    mapped_address_space = (byte_t *)VirtualAlloc(
-        NULL,               // NULL specified as the base address parameter
-        mapped_region_size, // Size of allocation
-        MEM_RESERVE,        // Allocate reserved pages
-        PAGE_NOACCESS);     // Protection = no access
-#else
-#error "Only support Windows and Linux."
-#endif
-    next_free = mapped_address_space;
-  };
-
-public:
-  using buffer_id_t = int;
-
-  struct allocation {
-    buffer_t buffer;
-    byte_t *alloc_ptr;
-    size_t size;
-  };
-
-  ~mem_mgr() {
-#if defined(__linux__)
-    munmap(mapped_address_space, mapped_region_size);
-#elif defined(_WIN64)
-    VirtualFree(mapped_address_space, 0, MEM_RELEASE);
-#else
-#error "Only support Windows and Linux."
-#endif
-  };
-
-  mem_mgr(const mem_mgr &) = delete;
-  mem_mgr &operator=(const mem_mgr &) = delete;
-  mem_mgr(mem_mgr &&) = delete;
-  mem_mgr &operator=(mem_mgr &&) = delete;
-
-  /// Allocate
-  void *mem_alloc(size_t size) {
-    if (!size)
-      return nullptr;
-    std::lock_guard<std::mutex> lock(m_mutex);
-    if (next_free + size > mapped_address_space + mapped_region_size) {
-      throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool");
-    }
-    // Allocation
-    sycl::range<1> r(size);
-    buffer_t buf(r);
-    allocation A{buf, next_free, size};
-    // Map allocation to device pointer
-    void *result = next_free;
-    m_map.emplace(next_free + size, A);
-    // Update pointer to the next free space.
-    next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
-
-    return result;
-  }
-
-  /// Deallocate
-  void mem_free(const void *ptr) {
-    if (!ptr)
-      return;
-    std::lock_guard<std::mutex> lock(m_mutex);
-    auto it = get_map_iterator(ptr);
-    m_map.erase(it);
-  }
-
-  /// map: device pointer -> allocation(buffer, alloc_ptr, size)
-  allocation translate_ptr(const void *ptr) {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    auto it = get_map_iterator(ptr);
-    return it->second;
-  }
-
-  /// Check if the pointer represents device pointer or not.
-  bool is_device_ptr(const void *ptr) const {
-    std::lock_guard<std::mutex> lock(m_mutex);
-    return (mapped_address_space <= ptr) &&
-           (ptr < mapped_address_space + mapped_region_size);
-  }
-
-  /// Returns the instance of memory manager singleton.
-  static mem_mgr &instance() {
-    static mem_mgr m;
-    return m;
-  }
-
-private:
-  std::map<byte_t *, allocation> m_map;
-  mutable std::mutex m_mutex;
-  byte_t *mapped_address_space;
-  byte_t *next_free;
-  const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
-  const size_t alignment = 256;
-  /// This padding may be defined to some positive value to debug
-  /// out of bound accesses.
-  const size_t extra_padding = 0;
-
-  std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr) {
-    auto it = m_map.upper_bound((byte_t *)ptr);
-    if (it == m_map.end()) {
-      // Not a virtual pointer.
-      throw std::runtime_error("can not get buffer from non-virtual pointer");
-    }
-    const allocation &alloc = it->second;
-    if (ptr < alloc.alloc_ptr) {
-      // Out of bound.
-      // This may happen if there's a gap between allocations due to alignment
-      // or extra padding and pointer points to this gap.
-      throw std::runtime_error("invalid virtual pointer");
-    }
-    return it;
-  }
-};
-
-template <class T, memory_region Memory, size_t Dimension> class accessor;
-template <memory_region Memory, class T = byte_t> class memory_traits {
-public:
-  static constexpr sycl::access::target target =
-      sycl::access::target::device;
-  static constexpr sycl::access_mode mode =
-      (Memory == constant) ? sycl::access_mode::read
-                           : sycl::access_mode::read_write;
-  static constexpr size_t type_size = sizeof(T);
-  using element_t =
-      typename std::conditional<Memory == constant, const T, T>::type;
-  using value_t = typename std::remove_cv<T>::type;
-  template <size_t Dimension = 1>
-  using accessor_t = typename std::conditional<
-      Memory == local, sycl::local_accessor<value_t, Dimension>,
-      sycl::accessor<T, Dimension, mode, target>>::type;
-  using pointer_t = T *;
-};
-
-static inline void *dpct_malloc(size_t size, sycl::queue &q) {
-#ifdef DPCT_USM_LEVEL_NONE
-  return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
-#else
-  return sycl::malloc_device(size, q.get_device(), q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
-}
-
-#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
-static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
-                                sycl::queue &q) {
-  pitch = PITCH_DEFAULT_ALIGN(x);
-  return dpct_malloc(pitch * y * z, q);
-}
-
-/**
- * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
- * @tparam valueT The type of the element to be set.
- * @param [in] q The queue in which the operation is done.
- * @param [in] dev_ptr Pointer to the virtual device memory address.
- * @param [in] value The value to be set.
- * @param [in] size Number of elements to be set to the value.
- * @return An event representing the memset operation.
- */
-template <typename valueT>
-static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
-                                      valueT value, size_t size) {
-#ifdef DPCT_USM_LEVEL_NONE
-  auto &mm = mem_mgr::instance();
-  assert(mm.is_device_ptr(dev_ptr));
-  auto alloc = mm.translate_ptr(dev_ptr);
-  size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
-
-  return q.submit([&](sycl::handler &cgh) {
-    auto r = sycl::range<1>(size);
-    auto o = sycl::id<1>(offset);
-    auto new_buffer = alloc.buffer.reinterpret<valueT>(
-        sycl::range<1>(alloc.size / sizeof(valueT)));
-    sycl::accessor<valueT, 1, sycl::access_mode::write,
-                   sycl::access::target::device>
-        acc(new_buffer, cgh, r, o);
-    cgh.fill(acc, value);
-  });
-#else
-  return q.fill(dev_ptr, value, size);
-#endif // DPCT_USM_LEVEL_NONE
-}
-
-/**
- * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
- * @tparam valueT The type of the element to be set.
- * @param [in] q The queue in which the operation is done.
- * @param [in] data Pointer to the pitched device memory region.
- * @param [in] value The value to be set.
- * @param [in] size 3D memory region by number of elements.
- * @return An event list representing the memset operations.
- */
-template<typename valueT>
-static inline std::vector<sycl::event>
-dpct_memset(sycl::queue &q, pitched_data data, valueT value,
-            sycl::range<3> size) {
-  std::vector<sycl::event> event_list;
-  size_t slice = data.get_pitch() * data.get_y();
-  unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
-  for (size_t z = 0; z < size.get(2); ++z) {
-    unsigned char *data_ptr = data_surface;
-    for (size_t y = 0; y < size.get(1); ++y) {
-      event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
-      data_ptr += data.get_pitch();
-    }
-    data_surface += slice;
-  }
-  return event_list;
-}
-
-/**
- * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
- * @tparam valueT The type of the element to be set.
- * @param [in] q The queue in which the operation is done.
- * @param [in] ptr Pointer to the virtual device memory.
- * @param [in] pitch The pitch size by number of elements, including padding.
- * @param [in] val The value to be set.
- * @param [in] x The width of memory region by number of elements.
- * @param [in] y The height of memory region by number of elements.
- * @return An event list representing the memset operations.
- */
-template<typename valueT>
-static inline std::vector<sycl::event>
-dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
-            size_t y) {
-  return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
-                     sycl::range<3>(x, y, 1));
-}
-
-enum class pointer_access_attribute {
-  host_only = 0,
-  device_only,
-  host_device,
-  end
-};
-
-static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
-                                                      const void *ptr) {
-#ifdef DPCT_USM_LEVEL_NONE
-  return mem_mgr::instance().is_device_ptr(ptr)
-             ? pointer_access_attribute::device_only
-             : pointer_access_attribute::host_only;
-#else
-  switch (sycl::get_pointer_type(ptr, q.get_context())) {
-  case sycl::usm::alloc::unknown:
-    return pointer_access_attribute::host_only;
-  case sycl::usm::alloc::device:
-    return pointer_access_attribute::device_only;
-  case sycl::usm::alloc::shared:
-  case sycl::usm::alloc::host:
-    return pointer_access_attribute::host_device;
-  }
-#endif
-}
-
-static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
-                                             const void *from_ptr,
-                                             memcpy_direction dir) {
-  switch (dir) {
-  case memcpy_direction::host_to_host:
-  case memcpy_direction::host_to_device:
-  case memcpy_direction::device_to_host:
-  case memcpy_direction::device_to_device:
-    return dir;
-  case memcpy_direction::automatic: {
-    // table[to_attribute][from_attribute]
-    static const memcpy_direction
-        direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
-                       [static_cast<unsigned>(pointer_access_attribute::end)] =
-                           {{memcpy_direction::host_to_host,
-                             memcpy_direction::device_to_host,
-                             memcpy_direction::host_to_host},
-                            {memcpy_direction::host_to_device,
-                             memcpy_direction::device_to_device,
-                             memcpy_direction::device_to_device},
-                            {memcpy_direction::host_to_host,
-                             memcpy_direction::device_to_device,
-                             memcpy_direction::device_to_device}};
-    return direction_table[static_cast<unsigned>(get_pointer_attribute(
-        q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
-  }
-  default:
-    throw std::runtime_error("dpct_memcpy: invalid direction value");
-  }
-}
-
-static sycl::event
-dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
-            memcpy_direction direction,
-            const std::vector<sycl::event> &dep_events = {}) {
-  if (!size)
-    return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-  auto &mm = mem_mgr::instance();
-  auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-  switch (real_direction) {
-  case host_to_host:
-    return q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); });
-    });
-  case host_to_device: {
-    auto alloc = mm.translate_ptr(to_ptr);
-    size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-    return q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      auto r = sycl::range<1>(size);
-      auto o = sycl::id<1>(offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                          sycl::access::target::device>
-          acc(alloc.buffer, cgh, r, o);
-      cgh.copy(from_ptr, acc);
-    });
-  }
-  case device_to_host: {
-    auto alloc = mm.translate_ptr(from_ptr);
-    size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-    return q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      auto r = sycl::range<1>(size);
-      auto o = sycl::id<1>(offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                          sycl::access::target::device>
-          acc(alloc.buffer, cgh, r, o);
-      cgh.copy(acc, to_ptr);
-    });
-  }
-  case device_to_device: {
-    auto to_alloc = mm.translate_ptr(to_ptr);
-    auto from_alloc = mm.translate_ptr(from_ptr);
-    size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-    size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-    return q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      auto r = sycl::range<1>(size);
-      auto to_o = sycl::id<1>(to_offset);
-      auto from_o = sycl::id<1>(from_offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                          sycl::access::target::device>
-          to_acc(to_alloc.buffer, cgh, r, to_o);
-      sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                          sycl::access::target::device>
-          from_acc(from_alloc.buffer, cgh, r, from_o);
-      cgh.copy(from_acc, to_acc);
-    });
-  }
-  default:
-    throw std::runtime_error("dpct_memcpy: invalid direction value");
-  }
-#else
-  return q.memcpy(to_ptr, from_ptr, size, dep_events);
-#endif // DPCT_USM_LEVEL_NONE
-}
-
-// Get actual copy range and make sure it will not exceed range.
-static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-                                    size_t pitch) {
-  return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
-}
-
-static inline size_t get_offset(sycl::id<3> id, size_t slice,
-                                    size_t pitch) {
-  return slice * id.get(2) + pitch * id.get(1) + id.get(0);
-}
-
-/// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
-/// and \p from_range to another specified by \p to_ptr and \p to_range.
-static inline std::vector<sycl::event>
-dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-            sycl::range<3> to_range, sycl::range<3> from_range,
-            sycl::id<3> to_id, sycl::id<3> from_id,
-            sycl::range<3> size, memcpy_direction direction,
-            const std::vector<sycl::event> &dep_events = {}) {
-  // RAII for host pointer
-  class host_buffer {
-    void *_buf;
-    size_t _size;
-    sycl::queue &_q;
-    const std::vector<sycl::event> &_deps; // free operation depends
-
-  public:
-    host_buffer(size_t size, sycl::queue &q,
-                const std::vector<sycl::event> &deps)
-        : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-    void *get_ptr() const { return _buf; }
-    size_t get_size() const { return _size; }
-    ~host_buffer() {
-      if (_buf) {
-        _q.submit([&](sycl::handler &cgh) {
-          cgh.depends_on(_deps);
-          cgh.host_task([buf = _buf] { std::free(buf); });
-        });
-      }
-    }
-  };
-  std::vector<sycl::event> event_list;
-
-  size_t to_slice = to_range.get(1) * to_range.get(0),
-         from_slice = from_range.get(1) * from_range.get(0);
-  unsigned char *to_surface =
-      (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-  const unsigned char *from_surface =
-      (const unsigned char *)from_ptr +
-      get_offset(from_id, from_slice, from_range.get(0));
-
-  if (to_slice == from_slice && to_slice == size.get(1) * size.get(0)) {
-    return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                        direction, dep_events)};
-  }
-  direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-  size_t size_slice = size.get(1) * size.get(0);
-  switch (direction) {
-  case host_to_host:
-    for (size_t z = 0; z < size.get(2); ++z) {
-      unsigned char *to_ptr = to_surface;
-      const unsigned char *from_ptr = from_surface;
-      if (to_range.get(0) == from_range.get(0) &&
-          to_range.get(0) == size.get(0)) {
-        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
-                                         direction, dep_events));
-      } else {
-        for (size_t y = 0; y < size.get(1); ++y) {
-          event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
-                                           direction, dep_events));
-          to_ptr += to_range.get(0);
-          from_ptr += from_range.get(0);
-        }
-      }
-      to_surface += to_slice;
-      from_surface += from_slice;
-    }
-    break;
-  case host_to_device: {
-    host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                    event_list);
-    std::vector<sycl::event> host_events;
-    if (to_slice == size_slice) {
-      // Copy host data to a temp host buffer with the shape of target.
-      host_events =
-          dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                      sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                      host_to_host, dep_events);
-    } else {
-      // Copy host data to a temp host buffer with the shape of target.
-      host_events = dpct_memcpy(
-          q, buf.get_ptr(), from_surface, to_range, from_range,
-          sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
-          // If has padding data, not sure whether it is useless. So fill temp
-          // buffer with it.
-          std::vector<sycl::event>{
-              dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
-                          device_to_host, dep_events)});
-    }
-    // Copy from temp host buffer to device with only one submit.
-    event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
-                                     buf.get_size(), host_to_device,
-                                     host_events));
-    break;
-  }
-  case device_to_host: {
-    host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                    event_list);
-    // Copy from host temp buffer to host target with reshaping.
-    event_list = dpct_memcpy(
-        q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
-        sycl::id<3>(0, 0, 0), size, host_to_host,
-        // Copy from device to temp host buffer with only one submit.
-        std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
-                                                 buf.get_size(),
-                                                 device_to_host, dep_events)});
-    break;
-  }
-  case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-  {
-    auto &mm = mem_mgr::instance();
-    auto to_alloc = mm.translate_ptr(to_surface);
-    auto from_alloc = mm.translate_ptr(from_surface);
-    size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-    size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-    event_list.push_back(q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      auto to_o = sycl::id<1>(to_offset);
-      auto from_o = sycl::id<1>(from_offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                         sycl::access::target::device>
-          to_acc(to_alloc.buffer, cgh,
-                 get_copy_range(size, to_slice, to_range.get(0)), to_o);
-      sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                         sycl::access::target::device>
-          from_acc(from_alloc.buffer, cgh,
-                   get_copy_range(size, from_slice, from_range.get(0)), from_o);
-      cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-          size,
-          [=](sycl::id<3> id) {
-            to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                from_acc[get_offset(id, from_slice, from_range.get(0))];
-          });
-    }));
-  }
-#else
-    event_list.push_back(q.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(dep_events);
-      cgh.parallel_for<class dpct_memcpy_3d_detail>(
-          size,
-          [=](sycl::id<3> id) {
-            to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                from_surface[get_offset(id, from_slice, from_range.get(0))];
-          });
-    }));
-#endif
-  break;
-  default:
-    throw std::runtime_error("dpct_memcpy: invalid direction value");
-  }
-  return event_list;
-}
-
-/// memcpy 2D/3D matrix specified by pitched_data.
-static inline std::vector<sycl::event>
-dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
-            pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
-            memcpy_direction direction = automatic) {
-  return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-                     sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-                     sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
-                     size, direction);
-}
-
-/// memcpy 2D matrix with pitch.
-static inline std::vector<sycl::event>
-dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-            size_t to_pitch, size_t from_pitch, size_t x, size_t y,
-            memcpy_direction direction = automatic) {
-  return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-                     sycl::range<3>(from_pitch, y, 1),
-                     sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
-                     sycl::range<3>(x, y, 1), direction);
-}
-
-namespace deprecated {
-
-template <typename T, sycl::usm::alloc AllocKind>
-class usm_allocator {
-private:
-  using Alloc = sycl::usm_allocator<T, AllocKind>;
-  Alloc _impl;
-
-public:
-  using value_type = typename std::allocator_traits<Alloc>::value_type;
-  using pointer = typename std::allocator_traits<Alloc>::pointer;
-  using const_pointer = typename std::allocator_traits<Alloc>::const_pointer;
-  using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
-  using const_void_pointer =
-      typename std::allocator_traits<Alloc>::const_void_pointer;
-  using reference = typename std::allocator_traits<Alloc>::value_type &;
-  using const_reference =
-      const typename std::allocator_traits<Alloc>::value_type &;
-  using difference_type =
-      typename std::allocator_traits<Alloc>::difference_type;
-  using size_type = typename std::allocator_traits<Alloc>::size_type;
-  using propagate_on_container_copy_assignment = typename std::allocator_traits<
-      Alloc>::propagate_on_container_copy_assignment;
-  using propagate_on_container_move_assignment = typename std::allocator_traits<
-      Alloc>::propagate_on_container_move_assignment;
-  using propagate_on_container_swap =
-      typename std::allocator_traits<Alloc>::propagate_on_container_swap;
-  using is_always_equal =
-      typename std::allocator_traits<Alloc>::is_always_equal;
-
-  template <typename U> struct rebind {
-    typedef usm_allocator<U, AllocKind> other;
-  };
-
-  usm_allocator() : _impl(dpct::get_default_queue()) {}
-  ~usm_allocator() {}
-  usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
-  usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
-  pointer address(reference r) { return &r; }
-  const_pointer address(const_reference r) { return &r; }
-  pointer allocate(size_type cnt, const_void_pointer hint = nullptr) {
-    return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint);
-  }
-  void deallocate(pointer p, size_type cnt) {
-    std::allocator_traits<Alloc>::deallocate(_impl, p, cnt);
-  }
-  size_type max_size() const {
-    return std::allocator_traits<Alloc>::max_size(_impl);
-  }
-  bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
-  bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
-};
-
-} // namespace deprecated
-
-inline void dpct_free(void *ptr,
-                      const sycl::queue &q) {
-  if (ptr) {
-#ifdef DPCT_USM_LEVEL_NONE
-    detail::mem_mgr::instance().mem_free(ptr);
-#else
-    sycl::free(ptr, q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
-  }
-}
-} // namespace detail
-
-#ifdef DPCT_USM_LEVEL_NONE
-/// Check if the pointer \p ptr represents device pointer or not.
-///
-/// \param ptr The pointer to be checked.
-/// \returns true if \p ptr is a device pointer.
-template<class T>
-static inline bool is_device_ptr(T ptr) {
-  if constexpr (std::is_pointer<T>::value) {
-    return detail::mem_mgr::instance().is_device_ptr(ptr);
-  }
-  return false;
-}
-#endif
-
-/// Get the buffer and the offset of a piece of memory pointed to by \p ptr.
-///
-/// \param ptr Pointer to a piece of memory.
-/// If NULL is passed as an argument, an exception will be thrown.
-/// \returns a pair containing both the buffer and the offset.
-static std::pair<buffer_t, size_t> get_buffer_and_offset(const void *ptr) {
-  if (ptr) {
-    auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
-    size_t offset = (byte_t *)ptr - alloc.alloc_ptr;
-    return std::make_pair(alloc.buffer, offset);
-  } else {
-    throw std::runtime_error(
-        "NULL pointer argument in get_buffer_and_offset function is invalid");
-  }
-}
-
-/// Get the data pointed from \p ptr as a 1D buffer reinterpreted as type T.
-template <typename T> static sycl::buffer<T> get_buffer(const void *ptr) {
-  if (!ptr)
-    return sycl::buffer<T>(sycl::range<1>(0));
-  auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
-  return alloc.buffer.reinterpret<T>(
-      sycl::range<1>(alloc.size / sizeof(T)));
-}
-
-/// Get the buffer of a piece of memory pointed to by \p ptr.
-///
-/// \param ptr Pointer to a piece of memory.
-/// \returns the buffer.
-static buffer_t get_buffer(const void *ptr) {
-  return detail::mem_mgr::instance().translate_ptr(ptr).buffer;
-}
-
-/// A wrapper class contains an accessor and an offset.
-template <typename dataT,
-          sycl::access_mode accessMode = sycl::access_mode::read_write>
-class access_wrapper {
-  sycl::accessor<byte_t, 1, accessMode> accessor;
-  size_t offset;
-
-public:
-  /// Construct the accessor wrapper for memory pointed by \p ptr.
-  ///
-  /// \param ptr Pointer to memory.
-  /// \param cgh The command group handler.
-  access_wrapper(const void *ptr, sycl::handler &cgh)
-      : accessor(get_buffer(ptr).get_access<accessMode>(cgh)), offset(0) {
-    auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
-    offset = (byte_t *)ptr - alloc.alloc_ptr;
-  }
-
-  /// Get the device pointer.
-  ///
-  /// \returns a device pointer with offset.
-  dataT get_raw_pointer() const { return (dataT)(&accessor[0] + offset); }
-};
-
-/// Get the accessor for memory pointed by \p ptr.
-///
-/// \param ptr Pointer to memory.
-/// If NULL is passed as an argument, an exception will be thrown.
-/// \param cgh The command group handler.
-/// \returns an accessor.
-template <sycl::access_mode accessMode = sycl::access_mode::read_write>
-static sycl::accessor<byte_t, 1, accessMode>
-get_access(const void *ptr, sycl::handler &cgh) {
-  if (ptr) {
-    auto alloc = detail::mem_mgr::instance().translate_ptr(ptr);
-    return alloc.buffer.get_access<accessMode>(cgh);
-  } else {
-    throw std::runtime_error(
-        "NULL pointer argument in get_access function is invalid");
-  }
-}
-
-/// Allocate memory block on the device.
-/// \param num_bytes Number of bytes to allocate.
-/// \param q Queue to execute the allocate task.
-/// \returns A pointer to the newly allocated memory.
-template <typename T>
-static inline void *dpct_malloc(T num_bytes,
-                                sycl::queue &q = get_default_queue()) {
-  return detail::dpct_malloc(static_cast<size_t>(num_bytes), q);
-}
-
-/// Get the host pointer from a buffer that is mapped to virtual pointer ptr.
-/// \param ptr Virtual Pointer mapped to device buffer
-/// \returns A host pointer
-template <typename T> static inline T *get_host_ptr(const void *ptr) {
-  auto BufferOffset = get_buffer_and_offset(ptr);
-  auto host_ptr =
-      BufferOffset.first.get_host_access()
-          .get_pointer();
-  return (T *)(host_ptr + BufferOffset.second);
-}
-
-/// Allocate memory block for 3D array on the device.
-/// \param size Size of the memory block, in bytes.
-/// \param q Queue to execute the allocate task.
-/// \returns A pitched_data object which stores the memory info.
-static inline pitched_data
-dpct_malloc(sycl::range<3> size, sycl::queue &q = get_default_queue()) {
-  pitched_data pitch(nullptr, 0, size.get(0), size.get(1));
-  size_t pitch_size;
-  pitch.set_data_ptr(detail::dpct_malloc(pitch_size, size.get(0), size.get(1),
-                                         size.get(2), q));
-  pitch.set_pitch(pitch_size);
-  return pitch;
-}
-
-/// Allocate memory block for 2D array on the device.
-/// \param [out] pitch Aligned size of x in bytes.
-/// \param x Range in dim x.
-/// \param y Range in dim y.
-/// \param q Queue to execute the allocate task.
-/// \returns A pointer to the newly allocated memory.
-static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y,
-                                sycl::queue &q = get_default_queue()) {
-  return detail::dpct_malloc(pitch, x, y, 1, q);
-}
-
-/// free
-/// \param ptr Point to free.
-/// \param q Queue to execute the free task.
-/// \returns no return value.
-static inline void dpct_free(void *ptr,
-                             sycl::queue &q = get_default_queue()) {
-  detail::dpct_free(ptr, q);
-}
-
-/// Free the device memory pointed by a batch of pointers in \p pointers which
-/// are related to \p q after \p events completed.
-///
-/// \param pointers The pointers point to the device memory requested to be freed.
-/// \param events The events to be waited.
-/// \param q The sycl::queue the memory relates to.
-inline void async_dpct_free(const std::vector<void *> &pointers,
-                            const std::vector<sycl::event> &events,
-                            sycl::queue &q = get_default_queue()) {
-  q.submit([&](sycl::handler &cgh) {
-    cgh.depends_on(events);
-    cgh.host_task([=] {
-      for (auto p : pointers)
-        if (p) {
-          detail::dpct_free(p, q);
-        }
-    });
-  });
-}
-
-/// Synchronously copies \p size bytes from the address specified by \p from_ptr
-/// to the address specified by \p to_ptr. The value of \p direction is used to
-/// set the copy direction, it can be \a host_to_host, \a host_to_device,
-/// \a device_to_host, \a device_to_device or \a automatic. The function will
-/// return after the copy is completed.
-///
-/// \param to_ptr Pointer to destination memory address.
-/// \param from_ptr Pointer to source memory address.
-/// \param size Number of bytes to be copied.
-/// \param direction Direction of the copy.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-static void dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
-                        memcpy_direction direction = automatic,
-                        sycl::queue &q = get_default_queue()) {
-  detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction).wait();
-}
-
-/// Asynchronously copies \p size bytes from the address specified by \p
-/// from_ptr to the address specified by \p to_ptr. The value of \p direction is
-/// used to set the copy direction, it can be \a host_to_host, \a
-/// host_to_device, \a device_to_host, \a device_to_device or \a automatic. The
-/// return of the function does NOT guarantee the copy is completed.
-///
-/// \param to_ptr Pointer to destination memory address.
-/// \param from_ptr Pointer to source memory address.
-/// \param size Number of bytes to be copied.
-/// \param direction Direction of the copy.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
-                              memcpy_direction direction = automatic,
-                              sycl::queue &q = dpct::get_default_queue()) {
-  detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
-}
-
-/// Synchronously copies 2D matrix specified by \p x and \p y from the address
-/// specified by \p from_ptr to the address specified by \p to_ptr, while \p
-/// from_pitch and \p to_pitch are the range of dim x in bytes of the matrix
-/// specified by \p from_ptr and \p to_ptr. The value of \p direction is used to
-/// set the copy direction, it can be \a host_to_host, \a host_to_device, \a
-/// device_to_host, \a device_to_device or \a automatic. The function will
-/// return after the copy is completed.
-///
-/// \param to_ptr Pointer to destination memory address.
-/// \param to_pitch Range of dim x in bytes of destination matrix.
-/// \param from_ptr Pointer to source memory address.
-/// \param from_pitch Range of dim x in bytes of source matrix.
-/// \param x Range of dim x of matrix to be copied.
-/// \param y Range of dim y of matrix to be copied.
-/// \param direction Direction of the copy.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-static inline void dpct_memcpy(void *to_ptr, size_t to_pitch,
-                               const void *from_ptr, size_t from_pitch,
-                               size_t x, size_t y,
-                               memcpy_direction direction = automatic,
-                               sycl::queue &q = dpct::get_default_queue()) {
-  sycl::event::wait(detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch,
-                                            from_pitch, x, y, direction));
-}
-
-/// Asynchronously copies 2D matrix specified by \p x and \p y from the address
-/// specified by \p from_ptr to the address specified by \p to_ptr, while \p
-/// \p from_pitch and \p to_pitch are the range of dim x in bytes of the matrix
-/// specified by \p from_ptr and \p to_ptr. The value of \p direction is used to
-/// set the copy direction, it can be \a host_to_host, \a host_to_device, \a
-/// device_to_host, \a device_to_device or \a automatic. The return of the
-/// function does NOT guarantee the copy is completed.
-///
-/// \param to_ptr Pointer to destination memory address.
-/// \param to_pitch Range of dim x in bytes of destination matrix.
-/// \param from_ptr Pointer to source memory address.
-/// \param from_pitch Range of dim x in bytes of source matrix.
-/// \param x Range of dim x of matrix to be copied.
-/// \param y Range of dim y of matrix to be copied.
-/// \param direction Direction of the copy.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-static inline void
-async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
-                  size_t from_pitch, size_t x, size_t y,
-                  memcpy_direction direction = automatic,
-                  sycl::queue &q = get_default_queue()) {
-  detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
-                      direction);
-}
-
-/// Synchronously copies a subset of a 3D matrix specified by \p to to another
-/// 3D matrix specified by \p from. The from and to position info are specified
-/// by \p from_pos and \p to_pos The copied matrix size is specified by \p size.
-/// The value of \p direction is used to set the copy direction, it can be \a
-/// host_to_host, \a host_to_device, \a device_to_host, \a device_to_device or
-/// \a automatic. The function will return after the copy is completed.
-///
-/// \param to Destination matrix info.
-/// \param to_pos Position of destination.
-/// \param from Source matrix info.
-/// \param from_pos Position of destination.
-/// \param size Range of the submatrix to be copied.
-/// \param direction Direction of the copy.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-static inline void dpct_memcpy(pitched_data to, sycl::id<3> to_pos,
-                               pitched_data from, sycl::id<3> from_pos,
-                               sycl::range<3> size,
-                               memcpy_direction direction = automatic,
-                               sycl::queue &q = dpct::get_default_queue()) {
-  sycl::event::wait(
-      detail::dpct_memcpy(q, to, to_pos, from, from_pos, size, direction));
-}
-
-/// Asynchronously copies a subset of a 3D matrix specified by \p to to another
-/// 3D matrix specified by \p from. The from and to position info are specified
-/// by \p from_pos and \p to_pos The copied matrix size is specified by \p size.
-/// The value of \p direction is used to set the copy direction, it can be \a
-/// host_to_host, \a host_to_device, \a device_to_host, \a device_to_device or
-/// \a automatic. The return of the function does NOT guarantee the copy is
-/// completed.
-///
-/// \param to Destination matrix info.
-/// \param to_pos Position of destination.
-/// \param from Source matrix info.
-/// \param from_pos Position of destination.
-/// \param size Range of the submatrix to be copied.
-/// \param direction Direction of the copy.
-/// \param q Queue to execute the copy task.
-/// \returns no return value.
-static inline void
-async_dpct_memcpy(pitched_data to, sycl::id<3> to_pos, pitched_data from,
-                  sycl::id<3> from_pos, sycl::range<3> size,
-                  memcpy_direction direction = automatic,
-                  sycl::queue &q = get_default_queue()) {
-  detail::dpct_memcpy(q, to, to_pos, from, from_pos, size, direction);
-}
-/**
- * @brief Sets 1 byte data \p value to the first \p size elements starting from
- * \p dev_ptr in \p q synchronously.
- * @param [in] dev_ptr Pointer to the virtual device memory address.
- * @param [in] value The value to be set.
- * @param [in] size Number of elements to be set to the value.
- * @param [in] q The queue in which the operation is done.
- */
-static void dpct_memset(void *dev_ptr, int value, size_t size,
-                        sycl::queue &q = get_default_queue()) {
-  detail::dpct_memset<unsigned char>(q, dev_ptr, value, size).wait();
-}
-
-/**
- * @brief Sets 2 bytes data \p value to the first \p size elements starting from
- * \p dev_ptr in \p q synchronously.
- * @param [in] dev_ptr Pointer to the virtual device memory address.
- * @param [in] value The value to be set.
- * @param [in] size Number of elements to be set to the value.
- * @param [in] q The queue in which the operation is done.
- */
-static void dpct_memset_d16(void *dev_ptr, unsigned short value, size_t size,
-                        sycl::queue &q = get_default_queue()) {
-  detail::dpct_memset(q, dev_ptr, value, size).wait();
-}
-/**
- * @brief Sets 4 bytes data \p value to the first \p size elements starting from
- * \p dev_ptr in \p q synchronously.
- * @param [in] dev_ptr Pointer to the virtual device memory address.
- * @param [in] value The value to be set.
- * @param [in] size Number of elements to be set to the value.
- * @param [in] q The queue in which the operation is done.
- */
-static void dpct_memset_d32(void *dev_ptr, unsigned int value, size_t size,
-                        sycl::queue &q = get_default_queue()) {
-  detail::dpct_memset(q, dev_ptr, value, size).wait();
-}
-
-/**
- * @brief Sets 1 byte data \p value to the first \p size elements starting from
- * \p dev_ptr in \p q asynchronously.
- * @param [in] dev_ptr Pointer to the virtual device memory address.
- * @param [in] value The value to be set.
- * @param [in] size Number of elements to be set to the value.
- * @param [in] q The queue in which the operation is done.
- */
-static void async_dpct_memset(void *dev_ptr, int value, size_t size,
-                              sycl::queue &q = dpct::get_default_queue()) {
-  detail::dpct_memset<unsigned char>(q, dev_ptr, value, size);
-}
-/**
- * @brief Sets 2 bytes data \p value to the first \p size elements starting from
- * \p dev_ptr in \p q asynchronously.
- * @param [in] dev_ptr Pointer to the virtual device memory address.
- * @param [in] value The value to be set.
- * @param [in] size Number of elements to be set to the value.
- * @param [in] q The queue in which the operation is done.
- */
-static void async_dpct_memset_d16(void *dev_ptr, unsigned short value, size_t size,
-                              sycl::queue &q = dpct::get_default_queue()) {
-  detail::dpct_memset(q, dev_ptr, value, size);
-}
-/**
- * @brief Sets 4 bytes data \p value to the first \p size elements starting from
- * \p dev_ptr in \p q asynchronously.
- * @param [in] dev_ptr Pointer to the virtual device memory address.
- * @param [in] value The value to be set.
- * @param [in] size Number of elements to be set to the value.
- * @param [in] q The queue in which the operation is done.
- */
-static void async_dpct_memset_d32(void *dev_ptr, unsigned int value, size_t size,
-                              sycl::queue &q = dpct::get_default_queue()) {
-  detail::dpct_memset(q, dev_ptr, value, size);
-}
-
-/**
- * @brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p ptr in \p q
- * synchronously.
- * @param [in] ptr Pointer to the virtual device memory.
- * @param [in] pitch The pitch size by number of elements, including padding.
- * @param [in] val The value to be set.
- * @param [in] x The width of memory region by number of elements.
- * @param [in] y The height of memory region by number of elements.
- * @param [in] q The queue in which the operation is done.
- */
-static inline void dpct_memset(void *ptr, size_t pitch, int val, size_t x,
-                               size_t y,
-                               sycl::queue &q = get_default_queue()) {
-  sycl::event::wait(detail::dpct_memset<unsigned char>(q, ptr, pitch, val, x, y));
-}
-/**
- * @brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by \p ptr in \p q
- * synchronously.
- * @param [in] ptr Pointer to the virtual device memory.
- * @param [in] pitch The pitch size by number of elements, including padding.
- * @param [in] val The value to be set.
- * @param [in] x The width of memory region by number of elements.
- * @param [in] y The height of memory region by number of elements.
- * @param [in] q The queue in which the operation is done.
- */
-static inline void dpct_memset_d16(void *ptr, size_t pitch, unsigned short val, size_t x,
-                               size_t y,
-                               sycl::queue &q = get_default_queue()) {
-  sycl::event::wait(detail::dpct_memset(q, ptr, pitch, val, x, y));
-}
-/**
- * @brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by \p ptr in \p q
- * synchronously.
- * @param [in] ptr Pointer to the virtual device memory.
- * @param [in] pitch The pitch size by number of elements, including padding.
- * @param [in] val The value to be set.
- * @param [in] x The width of memory region by number of elements.
- * @param [in] y The height of memory region by number of elements.
- * @param [in] q The queue in which the operation is done.
- */
-static inline void dpct_memset_d32(void *ptr, size_t pitch, unsigned int val, size_t x,
-                               size_t y,
-                               sycl::queue &q = get_default_queue()) {
-  sycl::event::wait(detail::dpct_memset(q, ptr, pitch, val, x, y));
-}
-
-/**
- * @brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p ptr in \p q
- * asynchronously.
- * @param [in] ptr Pointer to the virtual device memory.
- * @param [in] pitch The pitch size by number of elements, including padding.
- * @param [in] val The value to be set.
- * @param [in] x The width of memory region by number of elements.
- * @param [in] y The height of memory region by number of elements.
- * @param [in] q The queue in which the operation is done.
- */
-static inline void async_dpct_memset(void *ptr, size_t pitch, int val, size_t x,
-                                     size_t y,
-                                     sycl::queue &q = get_default_queue()) {
-  detail::dpct_memset<unsigned char>(q, ptr, pitch, val, x, y);
-}
-
-/**
- * @brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by \p ptr in \p q
- * asynchronously.
- * @param [in] ptr Pointer to the virtual device memory.
- * @param [in] pitch The pitch size by number of elements, including padding.
- * @param [in] val The value to be set.
- * @param [in] x The width of memory region by number of elements.
- * @param [in] y The height of memory region by number of elements.
- * @param [in] q The queue in which the operation is done.
- */
-static inline void async_dpct_memset_d16(void *ptr, size_t pitch,
-                                         unsigned short val, size_t x, size_t y,
-                                         sycl::queue &q = get_default_queue()) {
-  detail::dpct_memset(q, ptr, pitch, val, x, y);
-}
-
-/**
- * @brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by \p ptr in \p q
- * asynchronously.
- * @param [in] ptr Pointer to the virtual device memory.
- * @param [in] pitch The pitch size by number of elements, including padding.
- * @param [in] val The value to be set.
- * @param [in] x The width of memory region by number of elements.
- * @param [in] y The height of memory region by number of elements.
- * @param [in] q The queue in which the operation is done.
- */
-static inline void async_dpct_memset_d32(void *ptr, size_t pitch,
-                                         unsigned int val, size_t x, size_t y,
-                                         sycl::queue &q = get_default_queue()) {
-  detail::dpct_memset(q, ptr, pitch, val, x, y);
-}
-
-/**
- * @brief Sets 1 byte data \p value to the 3D memory region pointed by \p data in \p q
- * synchronously.
- * @param [in] data Pointer to the pitched device memory region.
- * @param [in] value The value to be set.
- * @param [in] size 3D memory region by number of elements.
- * @param [in] q The queue in which the operation is done.
- */
-static inline void dpct_memset(pitched_data pitch, int val,
-                               sycl::range<3> size,
-                               sycl::queue &q = get_default_queue()) {
-  sycl::event::wait(detail::dpct_memset<unsigned char>(q, pitch, val, size));
-}
-
-/**
- * @brief Sets 1 byte data \p value to the 3D memory region pointed by \p data in \p q
- * asynchronously.
- * @param [in] data Pointer to the pitched device memory region.
- * @param [in] value The value to be set.
- * @param [in] size 3D memory region by number of elements.
- * @param [in] q The queue in which the operation is done.
- */
-static inline void async_dpct_memset(pitched_data pitch, int val,
-                                     sycl::range<3> size,
-                                     sycl::queue &q = get_default_queue()) {
-  detail::dpct_memset<unsigned char>(q, pitch, val, size);
-}
-
-/// dpct accessor used as device function parameter.
-template <class T, memory_region Memory, size_t Dimension> class accessor;
-template <class T, memory_region Memory> class accessor<T, Memory, 3> {
-public:
-  using memory_t = detail::memory_traits<Memory, T>;
-  using element_t = typename memory_t::element_t;
-  using pointer_t = typename memory_t::pointer_t;
-  using accessor_t = typename memory_t::template accessor_t<3>;
-  accessor(pointer_t data, const sycl::range<3> &in_range)
-      : _data(data), _range(in_range) {}
-  template <memory_region M = Memory>
-  accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
-      : accessor(acc, acc.get_range()) {}
-  accessor(const accessor_t &acc, const sycl::range<3> &in_range)
-      : accessor(acc.get_pointer(), in_range) {}
-  accessor<T, Memory, 2> operator[](size_t index) const {
-    sycl::range<2> sub(_range.get(1), _range.get(2));
-    return accessor<T, Memory, 2>(_data + index * sub.size(), sub);
-  }
-
-  pointer_t get_ptr() const { return _data; }
-
-private:
-  pointer_t _data;
-  sycl::range<3> _range;
-};
-template <class T, memory_region Memory> class accessor<T, Memory, 2> {
-public:
-  using memory_t = detail::memory_traits<Memory, T>;
-  using element_t = typename memory_t::element_t;
-  using pointer_t = typename memory_t::pointer_t;
-  using accessor_t = typename memory_t::template accessor_t<2>;
-  accessor(pointer_t data, const sycl::range<2> &in_range)
-      : _data(data), _range(in_range) {}
-  template <memory_region M = Memory>
-  accessor(typename std::enable_if<M != local, const accessor_t>::type &acc)
-      : accessor(acc, acc.get_range()) {}
-  accessor(const accessor_t &acc, const sycl::range<2> &in_range)
-      : accessor(acc.get_pointer(), in_range) {}
-
-  pointer_t operator[](size_t index) const {
-    return _data + _range.get(1) * index;
-  }
-
-  pointer_t get_ptr() const { return _data; }
-
-private:
-  pointer_t _data;
-  sycl::range<2> _range;
-};
-
-namespace detail {
-/// Device variable with address space of shared, global or constant.
-template <class T, memory_region Memory, size_t Dimension>
-class device_memory {
-public:
-  using accessor_t =
-      typename detail::memory_traits<Memory, T>::template accessor_t<Dimension>;
-  using value_t = typename detail::memory_traits<Memory, T>::value_t;
-  using dpct_accessor_t = dpct::accessor<T, Memory, Dimension>;
-
-  device_memory() : device_memory(sycl::range<Dimension>(1)) {}
-
-  /// Constructor of 1-D array with initializer list
-  device_memory(
-      const sycl::range<Dimension> &in_range,
-      std::initializer_list<value_t> &&init_list)
-      : device_memory(in_range) {
-    assert(init_list.size() <= in_range.size());
-    _host_ptr = (value_t *)std::malloc(_size);
-    std::memset(_host_ptr, 0, _size);
-    std::memcpy(_host_ptr, init_list.begin(), init_list.size() * sizeof(T));
-  }
-
-  /// Constructor of 2-D array with initializer list
-  template <size_t D = Dimension>
-  device_memory(
-      const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range,
-      std::initializer_list<std::initializer_list<value_t>> &&init_list)
-      : device_memory(in_range) {
-    assert(init_list.size() <= in_range[0]);
-    _host_ptr = (value_t *)std::malloc(_size);
-    std::memset(_host_ptr, 0, _size);
-    auto tmp_data = _host_ptr;
-    for (auto sub_list : init_list) {
-      assert(sub_list.size() <= in_range[1]);
-      std::memcpy(tmp_data, sub_list.begin(), sub_list.size() * sizeof(T));
-      tmp_data += in_range[1];
-    }
-  }
-
-  /// Constructor with range
-  device_memory(const sycl::range<Dimension> &range_in)
-      : _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
-        _host_ptr(nullptr), _device_ptr(nullptr) {
-    static_assert(
-        (Memory == global) || (Memory == constant) || (Memory == shared),
-        "device memory region should be global, constant or shared");
-    // Make sure that singleton class mem_mgr and dev_mgr will destruct later
-    // than this.
-    detail::mem_mgr::instance();
-    dev_mgr::instance();
-  }
-
-  /// Constructor with range
-  template <class... Args>
-  device_memory(Args... Arguments)
-      : device_memory(sycl::range<Dimension>(Arguments...)) {}
-
-  ~device_memory() {
-    if (_device_ptr && !_reference)
-      dpct::dpct_free(_device_ptr);
-    if (_host_ptr)
-      std::free(_host_ptr);
-  }
-
-  /// Allocate memory with default queue, and init memory if has initial value.
-  void init() {
-    init(dpct::get_default_queue());
-  }
-  /// Allocate memory with specified queue, and init memory if has initial value.
-  void init(sycl::queue &q) {
-    if (_device_ptr)
-      return;
-    if (!_size)
-      return;
-    allocate_device(q);
-    if (_host_ptr)
-      detail::dpct_memcpy(q, _device_ptr, _host_ptr, _size, host_to_device);
-  }
-
-  /// The variable is assigned to a device pointer.
-  void assign(value_t *src, size_t size) {
-    this->~device_memory();
-    new (this) device_memory(src, size);
-  }
-
-  /// Get memory pointer of the memory object, which is virtual pointer when
-  /// usm is not used, and device pointer when usm is used.
-  value_t *get_ptr() {
-    return get_ptr(get_default_queue());
-  }
-  /// Get memory pointer of the memory object, which is virtual pointer when
-  /// usm is not used, and device pointer when usm is used.
-  value_t *get_ptr(sycl::queue &q) {
-    init(q);
-    return _device_ptr;
-  }
-
-  /// Get the device memory object size in bytes.
-  size_t get_size() { return _size; }
-
-  template <size_t D = Dimension>
-  typename std::enable_if<D == 1, T>::type &operator[](size_t index) {
-    init();
-#ifdef DPCT_USM_LEVEL_NONE
-    return dpct::get_buffer<typename std::enable_if<D == 1, T>::type>(
-               _device_ptr)
-        .template get_access<sycl::access_mode::read_write>()[index];
-#else
-    return _device_ptr[index];
-#endif // DPCT_USM_LEVEL_NONE
-  }
-
-#ifdef DPCT_USM_LEVEL_NONE
-  /// Get sycl::accessor for the device memory object when usm is not used.
-  accessor_t get_access(sycl::handler &cgh) {
-    return get_buffer(_device_ptr)
-        .template reinterpret<T, Dimension>(_range)
-        .template get_access<detail::memory_traits<Memory, T>::mode,
-                             detail::memory_traits<Memory, T>::target>(cgh);
-  }
-#else
-  /// Get dpct::accessor with dimension info for the device memory object
-  /// when usm is used and dimension is greater than 1.
-  template <size_t D = Dimension>
-  typename std::enable_if<D != 1, dpct_accessor_t>::type
-  get_access(sycl::handler &cgh) {
-    return dpct_accessor_t((T *)_device_ptr, _range);
-  }
-#endif // DPCT_USM_LEVEL_NONE
-
-private:
-  device_memory(value_t *memory_ptr, size_t size)
-      : _size(size), _range(size / sizeof(T)), _reference(true),
-        _device_ptr(memory_ptr) {}
-
-  void allocate_device(sycl::queue &q) {
-#ifndef DPCT_USM_LEVEL_NONE
-    if (Memory == shared) {
-      _device_ptr = (value_t *)sycl::malloc_shared(
-          _size, q.get_device(), q.get_context());
-      return;
-    }
-#ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
-    if (Memory == constant) {
-      _device_ptr = (value_t *)sycl::malloc_device(
-          _size, q.get_device(), q.get_context(),
-          sycl::ext::oneapi::property::usm::device_read_only());
-      return;
-    }
-#endif
-#endif
-    _device_ptr = (value_t *)detail::dpct_malloc(_size, q);
-  }
-
-  size_t _size;
-  sycl::range<Dimension> _range;
-  bool _reference;
-  value_t *_host_ptr;
-  value_t *_device_ptr;
-};
-template <class T, memory_region Memory>
-class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {
-public:
-  using base = device_memory<T, Memory, 1>;
-  using value_t = typename base::value_t;
-  using accessor_t =
-      typename detail::memory_traits<Memory, T>::template accessor_t<0>;
-
-  /// Constructor with initial value.
-  device_memory(const value_t &val) : base(sycl::range<1>(1), {val}) {}
-
-  /// Default constructor
-  device_memory() : base(1) {}
-
-#ifdef DPCT_USM_LEVEL_NONE
-  /// Get sycl::accessor for the device memory object when usm is not used.
-  accessor_t get_access(sycl::handler &cgh) {
-    auto buf = get_buffer(base::get_ptr())
-                   .template reinterpret<T, 1>(sycl::range<1>(1));
-    return accessor_t(buf, cgh);
-  }
-#endif // DPCT_USM_LEVEL_NONE
-};
-}
-
-template <class T, size_t Dimension>
-using global_memory = detail::device_memory<T, global, Dimension>;
-template <class T, size_t Dimension>
-using constant_memory = detail::device_memory<T, constant, Dimension>;
-template <class T, size_t Dimension>
-using shared_memory = detail::device_memory<T, shared, Dimension>;
-
-// dpct::deprecated:: is for functionality that was introduced for compatibility
-// purpose, but relies on deprecated C++ features, which are either removed or
-// will be removed in the future standards.
-// Direct use of deprecated functionality in this namespace should be avoided.
-namespace deprecated {
-
-template <typename T>
-using usm_host_allocator = detail::deprecated::usm_allocator<T, sycl::usm::alloc::host>;
-
-template <typename T>
-using usm_device_allocator = detail::deprecated::usm_allocator<T, sycl::usm::alloc::shared>;
-} // namespace deprecated
-
-class pointer_attributes {
-public:
-  void init(const void *ptr,
-              sycl::queue &q = dpct::get_default_queue()) {
-#ifdef DPCT_USM_LEVEL_NONE
-    throw std::runtime_error(
-          "dpct::pointer_attributes: only works for USM pointer.");
-#else
-    memory_type = sycl::get_pointer_type(ptr, q.get_context());
-    if (memory_type == sycl::usm::alloc::unknown) {
-      device_id = -1;
-      return;
-    }
-    device_pointer = (memory_type !=
-                        sycl::usm::alloc::unknown) ? ptr : nullptr;
-    host_pointer = (memory_type !=
-                        sycl::usm::alloc::unknown) &&
-                   (memory_type != sycl::usm::alloc::device) ? ptr : nullptr;
-    sycl::device device_obj = sycl::get_pointer_device(ptr, q.get_context());
-    device_id = dpct::dev_mgr::instance().get_device_id(device_obj);
-#endif
-  }
-
-  sycl::usm::alloc get_memory_type() {
-    return memory_type;
-  }
-
-  const void *get_device_pointer() {
-    return device_pointer;
-  }
-
-  const void *get_host_pointer() {
-    return host_pointer;
-  }
-
-  bool is_memory_shared() {
-    return memory_type == sycl::usm::alloc::shared;
-  }
-
-  unsigned int get_device_id() {
-    return device_id;
-  }
-
-private:
-  sycl::usm::alloc memory_type = sycl::usm::alloc::unknown;
-  const void *device_pointer = nullptr;
-  const void *host_pointer = nullptr;
-  unsigned int device_id = -1;
-};
-} // namespace dpct
-#endif // __DPCT_MEMORY_HPP__
diff --git a/dpct/rng_utils.hpp b/dpct/rng_utils.hpp
deleted file mode 100644
index 6c79ca564639c..0000000000000
--- a/dpct/rng_utils.hpp
+++ /dev/null
@@ -1,535 +0,0 @@
-//==---- rng_utils.hpp ----------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_RNG_UTILS_HPP__
-#define __DPCT_RNG_UTILS_HPP__
-
-#include <sycl/sycl.hpp>
-#include <oneapi/mkl.hpp>
-#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
-#include <oneapi/mkl/rng/device.hpp>
-#endif
-#include "device.hpp"
-#include "lib_common_utils.hpp"
-
-namespace dpct {
-namespace rng {
-#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
-namespace device {
-/// The random number generator on device.
-/// \tparam engine_t The device random number generator engine. It can only be
-/// oneapi::mkl::rng::device::mrg32k3a<1> or
-/// oneapi::mkl::rng::device::mrg32k3a<4> or
-/// oneapi::mkl::rng::device::philox4x32x10<1> or
-/// oneapi::mkl::rng::device::philox4x32x10<4>.
-template <typename engine_t> class rng_generator {
-  static_assert(
-      std::disjunction_v<
-          std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<1>>,
-          std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<4>>,
-          std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<1>>,
-          std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<4>>,
-          std::is_same<engine_t, oneapi::mkl::rng::device::mcg59<1>>>,
-      "engine_t can only be oneapi::mkl::rng::device::mrg32k3a<1> or "
-      "oneapi::mkl::rng::device::mrg32k3a<4> or "
-      "oneapi::mkl::rng::device::philox4x32x10<1> or "
-      "oneapi::mkl::rng::device::philox4x32x10<4> or "
-      "oneapi::mkl::rng::device::mcg59<1>.");
-  static constexpr bool _is_engine_vec_size_one = std::disjunction_v<
-      std::is_same<engine_t, oneapi::mkl::rng::device::mrg32k3a<1>>,
-      std::is_same<engine_t, oneapi::mkl::rng::device::philox4x32x10<1>>,
-      std::is_same<engine_t, oneapi::mkl::rng::device::mcg59<1>>>;
-  static constexpr std::uint64_t default_seed = 0;
-  oneapi::mkl::rng::device::bits<std::uint32_t> _distr_bits;
-  oneapi::mkl::rng::device::uniform_bits<std::uint32_t> _distr_uniform_bits;
-  oneapi::mkl::rng::device::gaussian<float> _distr_gaussian_float;
-  oneapi::mkl::rng::device::gaussian<double> _distr_gaussian_double;
-  oneapi::mkl::rng::device::lognormal<float> _distr_lognormal_float;
-  oneapi::mkl::rng::device::lognormal<double> _distr_lognormal_double;
-  oneapi::mkl::rng::device::poisson<std::uint32_t> _distr_poisson;
-  oneapi::mkl::rng::device::uniform<float> _distr_uniform_float;
-  oneapi::mkl::rng::device::uniform<double> _distr_uniform_double;
-  engine_t _engine;
-
-public:
-  /// Default constructor of rng_generator
-  rng_generator() { _engine = engine_t(default_seed); }
-  /// Constructor of rng_generator if engine type is not mcg59
-  /// \param [in] seed The seed to initialize the engine state.
-  /// \param [in] num_to_skip Set the number of elements need to be skipped.
-  /// The number is calculated as: num_to_skip[0] + num_to_skip[1] * 2^64 +
-  /// num_to_skip[2] * 2^128 + ... + num_to_skip[n-1] * 2^(64*(n-1))
-  template <typename T = engine_t,
-            typename std::enable_if<!std::is_same_v<
-                T, oneapi::mkl::rng::device::mcg59<1>>>::type * = nullptr>
-  rng_generator(std::uint64_t seed,
-                std::initializer_list<std::uint64_t> num_to_skip) {
-    _engine = engine_t(seed, num_to_skip);
-  }
-  /// Constructor of rng_generator if engine type is mcg59
-  /// \param [in] seed The seed to initialize the engine state.
-  /// \param [in] num_to_skip Set the number of elements need to be skipped.
-  template <typename T = engine_t,
-            typename std::enable_if<std::is_same_v<
-                T, oneapi::mkl::rng::device::mcg59<1>>>::type * = nullptr>
-  rng_generator(std::uint64_t seed, std::uint64_t num_to_skip) {
-    _engine = engine_t(seed, num_to_skip);
-  }
-
-  /// Generate random number(s) obeys distribution \tparam distr_t.
-  /// \tparam T The distribution of the random number. It can only be
-  /// oneapi::mkl::rng::device::bits<std::uint32_t>,
-  /// oneapi::mkl::rng::device::uniform_bits<std::uint32_t>,
-  /// oneapi::mkl::rng::device::gaussian<float>,
-  /// oneapi::mkl::rng::device::gaussian<double>,
-  /// oneapi::mkl::rng::device::lognormal<float>,
-  /// oneapi::mkl::rng::device::lognormal<double>,
-  /// oneapi::mkl::rng::device::poisson<std::uint32_t>,
-  /// oneapi::mkl::rng::device::uniform<float> or
-  /// oneapi::mkl::rng::device::uniform<double>
-  /// \tparam vec_size The length of the return vector. It can only be 1, 2
-  /// or 4.
-  /// \param distr_params The parameter(s) for lognormal or poisson
-  /// distribution.
-  /// \return The vector of the random number(s).
-  template <typename distr_t, int vec_size, class... distr_params_t>
-  auto generate(distr_params_t... distr_params) {
-    static_assert(vec_size == 1 || vec_size == 2 || vec_size == 4,
-                  "vec_size is not supported.");
-    static_assert(
-        std::disjunction_v<
-            std::is_same<distr_t,
-                         oneapi::mkl::rng::device::bits<std::uint32_t>>,
-            std::is_same<distr_t,
-                         oneapi::mkl::rng::device::uniform_bits<std::uint32_t>>,
-            std::is_same<distr_t, oneapi::mkl::rng::device::gaussian<float>>,
-            std::is_same<distr_t, oneapi::mkl::rng::device::gaussian<double>>,
-            std::is_same<distr_t, oneapi::mkl::rng::device::lognormal<float>>,
-            std::is_same<distr_t, oneapi::mkl::rng::device::lognormal<double>>,
-            std::is_same<distr_t,
-                         oneapi::mkl::rng::device::poisson<std::uint32_t>>,
-            std::is_same<distr_t, oneapi::mkl::rng::device::uniform<float>>,
-            std::is_same<distr_t, oneapi::mkl::rng::device::uniform<double>>>,
-        "distribution is not supported.");
-
-    if constexpr (std::is_same_v<
-                      distr_t, oneapi::mkl::rng::device::bits<std::uint32_t>>) {
-      return generate_vec<vec_size>(_distr_bits);
-    }
-    if constexpr (std::is_same_v<
-                      distr_t,
-                      oneapi::mkl::rng::device::uniform_bits<std::uint32_t>>) {
-      return generate_vec<vec_size>(_distr_uniform_bits);
-    }
-    if constexpr (std::is_same_v<distr_t,
-                                 oneapi::mkl::rng::device::gaussian<float>>) {
-      return generate_vec<vec_size>(_distr_gaussian_float);
-    }
-    if constexpr (std::is_same_v<distr_t,
-                                 oneapi::mkl::rng::device::gaussian<double>>) {
-      return generate_vec<vec_size>(_distr_gaussian_double);
-    }
-    if constexpr (std::is_same_v<distr_t,
-                                 oneapi::mkl::rng::device::lognormal<float>>) {
-      return generate_vec<vec_size>(_distr_lognormal_float, distr_params...,
-                                    0.0f, 1.0f);
-    }
-    if constexpr (std::is_same_v<distr_t,
-                                 oneapi::mkl::rng::device::lognormal<double>>) {
-      return generate_vec<vec_size>(_distr_lognormal_double, distr_params...,
-                                    0.0, 1.0);
-    }
-    if constexpr (std::is_same_v<distr_t, oneapi::mkl::rng::device::poisson<
-                                              std::uint32_t>>) {
-      return generate_vec<vec_size>(_distr_poisson, distr_params...);
-    }
-    if constexpr (std::is_same_v<distr_t,
-                                 oneapi::mkl::rng::device::uniform<float>>) {
-      return generate_vec<vec_size>(_distr_uniform_float);
-    }
-    if constexpr (std::is_same_v<distr_t,
-                                 oneapi::mkl::rng::device::uniform<double>>) {
-      return generate_vec<vec_size>(_distr_uniform_double);
-    }
-  }
-
-  /// Get the random number generator engine.
-  /// \return The reference of the internal random number generator engine.
-  engine_t &get_engine() { return _engine; }
-
-private:
-  template <int vec_size, typename distr_t, class... distr_params_t>
-  auto generate_vec(distr_t &distr, distr_params_t... distr_params) {
-    if constexpr (sizeof...(distr_params_t)) {
-      typename distr_t::param_type pt(distr_params...);
-      distr.param(pt);
-    }
-    if constexpr (vec_size == 4) {
-      if constexpr (_is_engine_vec_size_one) {
-        sycl::vec<typename distr_t::result_type, 4> res;
-        res.x() = oneapi::mkl::rng::device::generate(distr, _engine);
-        res.y() = oneapi::mkl::rng::device::generate(distr, _engine);
-        res.z() = oneapi::mkl::rng::device::generate(distr, _engine);
-        res.w() = oneapi::mkl::rng::device::generate(distr, _engine);
-        return res;
-      } else {
-        return oneapi::mkl::rng::device::generate(distr, _engine);
-      }
-    } else if constexpr (vec_size == 1) {
-      if constexpr (_is_engine_vec_size_one) {
-        return oneapi::mkl::rng::device::generate(distr, _engine);
-      } else {
-        return oneapi::mkl::rng::device::generate_single(distr, _engine);
-      }
-    } else if constexpr (vec_size == 2) {
-      if constexpr (_is_engine_vec_size_one) {
-        sycl::vec<typename distr_t::result_type, 2> res;
-        res.x() = oneapi::mkl::rng::device::generate(distr, _engine);
-        res.y() = oneapi::mkl::rng::device::generate(distr, _engine);
-        return res;
-      } else {
-        sycl::vec<typename distr_t::result_type, 2> res;
-        res.x() = oneapi::mkl::rng::device::generate_single(distr, _engine);
-        res.y() = oneapi::mkl::rng::device::generate_single(distr, _engine);
-        return res;
-      }
-    }
-  }
-};
-
-} // namespace device
-#endif
-
-namespace host {
-namespace detail {
-class rng_generator_base {
-public:
-  /// Set the seed of host rng_generator.
-  /// \param seed The engine seed.
-  virtual void set_seed(const std::uint64_t seed) = 0;
-
-  /// Set the dimensions of host rng_generator.
-  /// \param dimensions The engine dimensions.
-  virtual void set_dimensions(const std::uint32_t dimensions) = 0;
-
-  /// Set the queue of host rng_generator.
-  /// \param queue The engine queue.
-  virtual void set_queue(sycl::queue *queue) = 0;
-
-  /// Generate unsigned int random number(s) with 'uniform_bits' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  virtual inline void generate_uniform_bits(unsigned int *output,
-                                            std::int64_t n) = 0;
-
-  /// Generate unsigned long long random number(s) with 'uniform_bits'
-  /// distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  virtual inline void generate_uniform_bits(unsigned long long *output,
-                                            std::int64_t n) = 0;
-
-  /// Generate float random number(s) with 'lognormal' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  /// \param m Mean of associated normal distribution
-  /// \param s Standard deviation of associated normal distribution.
-  virtual inline void generate_lognormal(float *output, std::int64_t n, float m,
-                                         float s) = 0;
-
-  /// Generate double random number(s) with 'lognormal' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  /// \param m Mean of associated normal distribution
-  /// \param s Standard deviation of associated normal distribution.
-  virtual inline void generate_lognormal(double *output, std::int64_t n,
-                                         double m, double s) = 0;
-
-  /// Generate float random number(s) with 'gaussian' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  /// \param mean Mean of normal distribution
-  /// \param stddev Standard deviation of normal distribution.
-  virtual inline void generate_gaussian(float *output, std::int64_t n,
-                                        float mean, float stddev) = 0;
-
-  /// Generate double random number(s) with 'gaussian' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  /// \param mean Mean of normal distribution
-  /// \param stddev Standard deviation of normal distribution.
-  virtual inline void generate_gaussian(double *output, std::int64_t n,
-                                        double mean, double stddev) = 0;
-
-  /// Generate unsigned int random number(s) with 'poisson' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  /// \param lambda Lambda for the Poisson distribution.
-  virtual inline void generate_poisson(unsigned int *output, std::int64_t n,
-                                       double lambda) = 0;
-
-  /// Generate float random number(s) with 'uniform' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  virtual inline void generate_uniform(float *output, std::int64_t n) = 0;
-
-  /// Generate double random number(s) with 'uniform' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  virtual inline void generate_uniform(double *output, std::int64_t n) = 0;
-
-  /// Skip ahead several random number(s).
-  /// \param num_to_skip The number of random numbers to be skipped.
-  virtual void skip_ahead(const std::uint64_t num_to_skip) = 0;
-
-  /// Set the direction numbers of host rng_generator. Only Sobol engine
-  /// supports this method.
-  /// \param direction_numbers The engine direction numbers.
-  virtual void set_direction_numbers(
-      const std::vector<std::uint32_t> &direction_numbers) = 0;
-
-protected:
-  sycl::queue *_queue{&dpct::get_default_queue()};
-  std::uint64_t _seed{0};
-  std::uint32_t _dimensions{1};
-  std::vector<std::uint32_t> _direction_numbers;
-};
-
-/// The random number generator on host.
-template <typename engine_t = oneapi::mkl::rng::philox4x32x10>
-class rng_generator : public rng_generator_base {
-public:
-  /// Constructor of rng_generator.
-  rng_generator() : _engine(create_engine(_queue, _seed, _dimensions)) {}
-
-  /// Set the seed of host rng_generator.
-  /// \param seed The engine seed.
-  void set_seed(const std::uint64_t seed) {
-    if (seed == _seed) {
-      return;
-    }
-    _seed = seed;
-    _engine = create_engine(_queue, _seed, _dimensions);
-  }
-
-  /// Set the dimensions of host rng_generator.
-  /// \param dimensions The engine dimensions.
-  void set_dimensions(const std::uint32_t dimensions) {
-    if (dimensions == _dimensions) {
-      return;
-    }
-    _dimensions = dimensions;
-    _engine = create_engine(_queue, _seed, _dimensions);
-  }
-
-  /// Set the queue of host rng_generator.
-  /// \param queue The engine queue.
-  void set_queue(sycl::queue *queue) {
-    if (queue == _queue) {
-      return;
-    }
-    _queue = queue;
-    _engine = create_engine(_queue, _seed, _dimensions);
-  }
-
-  /// Set the direction numbers of Sobol host rng_generator.
-  /// \param direction_numbers The user-defined direction numbers.
-  void
-  set_direction_numbers(const std::vector<std::uint32_t> &direction_numbers) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
-                             "Interfaces Project does not support this API.");
-#else
-    if constexpr (std::is_same_v<engine_t, oneapi::mkl::rng::sobol>) {
-      if (direction_numbers == _direction_numbers) {
-        return;
-      }
-      _direction_numbers = direction_numbers;
-      _engine = oneapi::mkl::rng::sobol(*_queue, _direction_numbers);
-    } else {
-      throw std::runtime_error("Only Sobol engine supports this method.");
-    }
-#endif
-  }
-
-  /// Generate unsigned int random number(s) with 'uniform_bits' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  inline void generate_uniform_bits(unsigned int *output, std::int64_t n) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
-                             "Interfaces Project does not support this API.");
-#else
-    static_assert(sizeof(unsigned int) == sizeof(std::uint32_t));
-    generate<oneapi::mkl::rng::uniform_bits<std::uint32_t>>(
-        (std::uint32_t *)output, n);
-#endif
-  }
-
-  /// Generate unsigned long long random number(s) with 'uniform_bits'
-  /// distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  inline void generate_uniform_bits(unsigned long long *output,
-                                    std::int64_t n) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
-                             "Interfaces Project does not support this API.");
-#else
-    static_assert(sizeof(unsigned long long) == sizeof(std::uint64_t));
-    generate<oneapi::mkl::rng::uniform_bits<std::uint64_t>>(
-        (std::uint64_t *)output, n);
-#endif
-  }
-
-  /// Generate float random number(s) with 'lognormal' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  /// \param m Mean of associated normal distribution
-  /// \param s Standard deviation of associated normal distribution.
-  inline void generate_lognormal(float *output, std::int64_t n, float m,
-                                 float s) {
-    generate<oneapi::mkl::rng::lognormal<float>>(output, n, m, s);
-  }
-
-  /// Generate double random number(s) with 'lognormal' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  /// \param m Mean of associated normal distribution
-  /// \param s Standard deviation of associated normal distribution.
-  inline void generate_lognormal(double *output, std::int64_t n, double m,
-                                 double s) {
-    generate<oneapi::mkl::rng::lognormal<double>>(output, n, m, s);
-  }
-
-  /// Generate float random number(s) with 'gaussian' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  /// \param mean Mean of normal distribution
-  /// \param stddev Standard deviation of normal distribution.
-  inline void generate_gaussian(float *output, std::int64_t n, float mean,
-                                float stddev) {
-    generate<oneapi::mkl::rng::gaussian<float>>(output, n, mean, stddev);
-  }
-
-  /// Generate double random number(s) with 'gaussian' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  /// \param mean Mean of normal distribution
-  /// \param stddev Standard deviation of normal distribution.
-  inline void generate_gaussian(double *output, std::int64_t n, double mean,
-                                double stddev) {
-    generate<oneapi::mkl::rng::gaussian<double>>(output, n, mean, stddev);
-  }
-
-  /// Generate unsigned int random number(s) with 'poisson' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  /// \param lambda Lambda for the Poisson distribution.
-  inline void generate_poisson(unsigned int *output, std::int64_t n,
-                               double lambda) {
-    generate<oneapi::mkl::rng::poisson<unsigned int>>(output, n, lambda);
-  }
-
-  /// Generate float random number(s) with 'uniform' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  inline void generate_uniform(float *output, std::int64_t n) {
-    generate<oneapi::mkl::rng::uniform<float>>(output, n);
-  }
-
-  /// Generate double random number(s) with 'uniform' distribution.
-  /// \param output The pointer of the first random number.
-  /// \param n The number of random numbers.
-  inline void generate_uniform(double *output, std::int64_t n) {
-    generate<oneapi::mkl::rng::uniform<double>>(output, n);
-  }
-
-  /// Skip ahead several random number(s).
-  /// \param num_to_skip The number of random numbers to be skipped.
-  void skip_ahead(const std::uint64_t num_to_skip) {
-#ifndef __INTEL_MKL__
-    oneapi::mkl::rng::skip_ahead(_engine, num_to_skip);
-#else
-    if constexpr (std::is_same_v<engine_t, oneapi::mkl::rng::mt2203>)
-      throw std::runtime_error("no skip_ahead method of mt2203 engine.");
-    else
-      oneapi::mkl::rng::skip_ahead(_engine, num_to_skip);
-#endif
-  }
-
-private:
-  static inline engine_t create_engine(sycl::queue *queue,
-                                       const std::uint64_t seed,
-                                       const std::uint32_t dimensions) {
-#ifdef __INTEL_MKL__
-    return std::is_same_v<engine_t, oneapi::mkl::rng::sobol>
-               ? engine_t(*queue, dimensions)
-               : engine_t(*queue, seed);
-#else
-    return engine_t(*queue, seed);
-#endif
-  }
-
-  template <typename distr_t, typename buffer_t, class... distr_params_t>
-  void generate(buffer_t *output, const std::int64_t n,
-                const distr_params_t... distr_params) {
-    auto output_buf = dpct::detail::get_memory<buffer_t>(output);
-    oneapi::mkl::rng::generate(distr_t(distr_params...), _engine, n,
-                               output_buf);
-  }
-  engine_t _engine{};
-};
-} // namespace detail
-} // namespace host
-
-enum class random_engine_type {
-  philox4x32x10,
-  mrg32k3a,
-  mt2203,
-  mt19937,
-  sobol,
-  mcg59
-};
-
-typedef std::shared_ptr<rng::host::detail::rng_generator_base> host_rng_ptr;
-
-/// Create a host random number generator.
-/// \param type The random engine type.
-/// \return The pointer of random number generator.
-inline host_rng_ptr create_host_rng(const random_engine_type type) {
-  switch (type) {
-  case random_engine_type::philox4x32x10:
-    return std::make_shared<
-        rng::host::detail::rng_generator<oneapi::mkl::rng::philox4x32x10>>();
-  case random_engine_type::mrg32k3a:
-    return std::make_shared<
-        rng::host::detail::rng_generator<oneapi::mkl::rng::mrg32k3a>>();
-#ifndef __INTEL_MKL__
-    throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) "
-                             "Interfaces Project does not support this API.");
-#else
-  case random_engine_type::mt2203:
-    return std::make_shared<
-        rng::host::detail::rng_generator<oneapi::mkl::rng::mt2203>>();
-  case random_engine_type::mt19937:
-    return std::make_shared<
-        rng::host::detail::rng_generator<oneapi::mkl::rng::mt19937>>();
-  case random_engine_type::sobol:
-    return std::make_shared<
-        rng::host::detail::rng_generator<oneapi::mkl::rng::sobol>>();
-  case random_engine_type::mcg59:
-    return std::make_shared<
-        rng::host::detail::rng_generator<oneapi::mkl::rng::mcg59>>();
-#endif
-  }
-}
-} // namespace rng
-} // namespace dpct
-
-#endif // __DPCT_RNG_UTILS_HPP__
diff --git a/dpct/sparse_utils.hpp b/dpct/sparse_utils.hpp
deleted file mode 100644
index b4f1ae4bcf330..0000000000000
--- a/dpct/sparse_utils.hpp
+++ /dev/null
@@ -1,1385 +0,0 @@
-//==---- sparse_utils.hpp -------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_SPARSE_UTILS_HPP__
-#define __DPCT_SPARSE_UTILS_HPP__
-
-#include "lib_common_utils.hpp"
-#include <oneapi/mkl.hpp>
-#include <sycl/sycl.hpp>
-
-namespace dpct {
-namespace sparse {
-/// Describes properties of a sparse matrix.
-/// The properties are matrix type, diag, uplo and index base.
-class matrix_info {
-public:
-  /// Matrix types are:
-  /// ge: General matrix
-  /// sy: Symmetric matrix
-  /// he: Hermitian matrix
-  /// tr: Triangular matrix
-  enum class matrix_type : int { ge = 0, sy, he, tr };
-
-  auto get_matrix_type() const { return _matrix_type; }
-  auto get_diag() const { return _diag; }
-  auto get_uplo() const { return _uplo; }
-  auto get_index_base() const { return _index_base; }
-  void set_matrix_type(matrix_type mt) { _matrix_type = mt; }
-  void set_diag(oneapi::mkl::diag d) { _diag = d; }
-  void set_uplo(oneapi::mkl::uplo u) { _uplo = u; }
-  void set_index_base(oneapi::mkl::index_base ib) { _index_base = ib; }
-
-private:
-  matrix_type _matrix_type = matrix_type::ge;
-  oneapi::mkl::diag _diag = oneapi::mkl::diag::nonunit;
-  oneapi::mkl::uplo _uplo = oneapi::mkl::uplo::upper;
-  oneapi::mkl::index_base _index_base = oneapi::mkl::index_base::zero;
-};
-
-enum class conversion_scope : int { index = 0, index_and_value };
-
-namespace detail {
-template <template <typename> typename functor_t, typename... args_t>
-inline void spblas_shim(library_data_t type, args_t &&...args) {
-  switch (type) {
-  case library_data_t::real_float: {
-    functor_t<float>()(std::forward<args_t>(args)...);
-    break;
-  }
-  case library_data_t::real_double: {
-    functor_t<double>()(std::forward<args_t>(args)...);
-    break;
-  }
-  case library_data_t::complex_float: {
-    functor_t<std::complex<float>>()(std::forward<args_t>(args)...);
-    break;
-  }
-  case library_data_t::complex_double: {
-    functor_t<std::complex<double>>()(std::forward<args_t>(args)...);
-    break;
-  }
-  default:
-    throw std::runtime_error("The data type is not supported.");
-  }
-}
-
-template <typename T> struct csrmv_impl {
-  void operator()(sycl::queue &queue, oneapi::mkl::transpose trans,
-                  int num_rows, int num_cols, const void *alpha,
-                  const std::shared_ptr<matrix_info> info, const void *val,
-                  const int *row_ptr, const int *col_ind, const void *x,
-                  const void *beta, void *y) {
-#ifndef __INTEL_MKL__
-    throw std::runtime_error(
-        "The oneAPI Math Kernel Library (oneMKL) Interfaces "
-        "Project does not support this API.");
-#else
-    using Ty = typename dpct::DataType<T>::T2;
-    auto alpha_value =
-        dpct::detail::get_value(reinterpret_cast<const Ty *>(alpha), queue);
-    auto beta_value =
-        dpct::detail::get_value(reinterpret_cast<const Ty *>(beta), queue);
-
-    oneapi::mkl::sparse::matrix_handle_t *sparse_matrix_handle =
-        new oneapi::mkl::sparse::matrix_handle_t;
-    oneapi::mkl::sparse::init_matrix_handle(sparse_matrix_handle);
-    auto data_row_ptr = dpct::detail::get_memory<int>(row_ptr);
-    auto data_col_ind = dpct::detail::get_memory<int>(col_ind);
-    auto data_val = dpct::detail::get_memory<Ty>(val);
-    oneapi::mkl::sparse::set_csr_data(queue, *sparse_matrix_handle, num_rows,
-                                      num_cols, info->get_index_base(),
-                                      data_row_ptr, data_col_ind, data_val);
-
-    auto data_x = dpct::detail::get_memory<Ty>(x);
-    auto data_y = dpct::detail::get_memory<Ty>(y);
-    switch (info->get_matrix_type()) {
-    case matrix_info::matrix_type::ge: {
-      oneapi::mkl::sparse::optimize_gemv(queue, trans, *sparse_matrix_handle);
-      oneapi::mkl::sparse::gemv(queue, trans, alpha_value,
-                                *sparse_matrix_handle, data_x, beta_value,
-                                data_y);
-      break;
-    }
-    case matrix_info::matrix_type::sy: {
-      oneapi::mkl::sparse::symv(queue, info->get_uplo(), alpha_value,
-                                *sparse_matrix_handle, data_x, beta_value,
-                                data_y);
-      break;
-    }
-    case matrix_info::matrix_type::tr: {
-      oneapi::mkl::sparse::optimize_trmv(queue, info->get_uplo(), trans,
-                                         info->get_diag(),
-                                         *sparse_matrix_handle);
-      oneapi::mkl::sparse::trmv(
-          queue, info->get_uplo(), trans, info->get_diag(), alpha_value,
-          *sparse_matrix_handle, data_x, beta_value, data_y);
-      break;
-    }
-    default:
-      throw std::runtime_error(
-          "the spmv does not support matrix_info::matrix_type::he");
-    }
-
-    sycl::event e =
-        oneapi::mkl::sparse::release_matrix_handle(queue, sparse_matrix_handle);
-    queue.submit([&](sycl::handler &cgh) {
-      cgh.depends_on(e);
-      cgh.host_task([=] { delete sparse_matrix_handle; });
-    });
-#endif
-  }
-};
-} // namespace detail
-
-/// Computes a CSR format sparse matrix-dense vector product.
-/// y = alpha * op(A) * x + beta * y
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans The operation applied to the matrix A.
-/// \param [in] num_rows Number of rows of the matrix A.
-/// \param [in] num_cols Number of columns of the matrix A.
-/// \param [in] alpha Scaling factor for the matrix A.
-/// \param [in] info Matrix info of the matrix A.
-/// \param [in] val An array containing the non-zero elements of the matrix A.
-/// \param [in] row_ptr An array of length \p num_rows + 1.
-/// \param [in] col_ind An array containing the column indices in index-based
-/// numbering.
-/// \param [in] x Data of the vector x.
-/// \param [in] beta Scaling factor for the vector x.
-/// \param [in, out] y Data of the vector y.
-template <typename T>
-void csrmv(sycl::queue &queue, oneapi::mkl::transpose trans, int num_rows,
-           int num_cols, const T *alpha,
-           const std::shared_ptr<matrix_info> info, const T *val,
-           const int *row_ptr, const int *col_ind, const T *x, const T *beta,
-           T *y) {
-  detail::csrmv_impl<T>()(queue, trans, num_rows, num_cols, alpha, info, val,
-                          row_ptr, col_ind, x, beta, y);
-}
-
-/// Computes a CSR format sparse matrix-dense vector product.
-/// y = alpha * op(A) * x + beta * y
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans The operation applied to the matrix A.
-/// \param [in] num_rows Number of rows of the matrix A.
-/// \param [in] num_cols Number of columns of the matrix A.
-/// \param [in] alpha Scaling factor for the matrix A.
-/// \param [in] alpha_type Data type of \p alpha .
-/// \param [in] info Matrix info of the matrix A.
-/// \param [in] val An array containing the non-zero elements of the matrix A.
-/// \param [in] val_type Data type of \p val .
-/// \param [in] row_ptr An array of length \p num_rows + 1.
-/// \param [in] col_ind An array containing the column indices in index-based
-/// numbering.
-/// \param [in] x Data of the vector x.
-/// \param [in] x_type Data type of \p x .
-/// \param [in] beta Scaling factor for the vector x.
-/// \param [in] beta_type Data type of \p beta .
-/// \param [in, out] y Data of the vector y.
-/// \param [in] y_type Data type of \p y .
-inline void csrmv(sycl::queue &queue, oneapi::mkl::transpose trans,
-                  int num_rows, int num_cols, const void *alpha,
-                  library_data_t alpha_type,
-                  const std::shared_ptr<matrix_info> info, const void *val,
-                  library_data_t val_type, const int *row_ptr,
-                  const int *col_ind, const void *x, library_data_t x_type,
-                  const void *beta, library_data_t beta_type, void *y,
-                  library_data_t y_type) {
-  detail::spblas_shim<detail::csrmv_impl>(val_type, queue, trans, num_rows,
-                                          num_cols, alpha, info, val, row_ptr,
-                                          col_ind, x, beta, y);
-}
-
-/// Computes a CSR format sparse matrix-dense matrix product.
-/// C = alpha * op(A) * op(B) + beta * C
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans_a The operation applied to the matrix A.
-/// \param [in] trans_b The operation applied to the matrix B.
-/// \param [in] sparse_rows Number of rows of the matrix A.
-/// \param [in] dense_cols Number of columns of the matrix B or C.
-/// \param [in] sparse_cols Number of columns of the matrix A.
-/// \param [in] alpha Scaling factor for the matrix A.
-/// \param [in] info Matrix info of the matrix A.
-/// \param [in] val An array containing the non-zero elements of the matrix A.
-/// \param [in] row_ptr An array of length \p num_rows + 1.
-/// \param [in] col_ind An array containing the column indices in index-based
-/// numbering.
-/// \param [in] b Data of the matrix B.
-/// \param [in] ldb Leading dimension of the matrix B.
-/// \param [in] beta Scaling factor for the matrix B.
-/// \param [in, out] c Data of the matrix C.
-/// \param [in] ldc Leading dimension of the matrix C.
-template <typename T>
-void csrmm(sycl::queue &queue, oneapi::mkl::transpose trans_a,
-           oneapi::mkl::transpose trans_b, int sparse_rows, int dense_cols,
-           int sparse_cols, const T *alpha,
-           const std::shared_ptr<matrix_info> info, const T *val,
-           const int *row_ptr, const int *col_ind, const T *b, int ldb,
-           const T *beta, T *c, int ldc) {
-#ifndef __INTEL_MKL__
-  throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                           "Project does not support this API.");
-#else
-  using Ty = typename dpct::DataType<T>::T2;
-  auto alpha_value =
-      dpct::detail::get_value(reinterpret_cast<const Ty *>(alpha), queue);
-  auto beta_value =
-      dpct::detail::get_value(reinterpret_cast<const Ty *>(beta), queue);
-
-  oneapi::mkl::sparse::matrix_handle_t *sparse_matrix_handle =
-      new oneapi::mkl::sparse::matrix_handle_t;
-  oneapi::mkl::sparse::init_matrix_handle(sparse_matrix_handle);
-  auto data_row_ptr = dpct::detail::get_memory<int>(row_ptr);
-  auto data_col_ind = dpct::detail::get_memory<int>(col_ind);
-  auto data_val = dpct::detail::get_memory<Ty>(val);
-  oneapi::mkl::sparse::set_csr_data(queue, *sparse_matrix_handle, sparse_rows,
-                                    sparse_cols, info->get_index_base(),
-                                    data_row_ptr, data_col_ind, data_val);
-
-  auto data_b = dpct::detail::get_memory<Ty>(b);
-  auto data_c = dpct::detail::get_memory<Ty>(c);
-  sycl::event gemm_event;
-  switch (info->get_matrix_type()) {
-  case matrix_info::matrix_type::ge: {
-#ifndef DPCT_USM_LEVEL_NONE
-    gemm_event =
-#endif
-        oneapi::mkl::sparse::gemm(queue, oneapi::mkl::layout::col_major,
-                                  trans_a, trans_b, alpha_value,
-                                  *sparse_matrix_handle, data_b, dense_cols,
-                                  ldb, beta_value, data_c, ldc);
-    break;
-  }
-  default:
-    throw std::runtime_error(
-        "the csrmm does not support matrix_info::matrix_type::sy, "
-        "matrix_info::matrix_type::tr and matrix_info::matrix_type::he");
-  }
-#ifdef DPCT_USM_LEVEL_NONE
-  queue.wait();
-#endif
-  sycl::event e = oneapi::mkl::sparse::release_matrix_handle(
-      queue, sparse_matrix_handle, {gemm_event});
-  queue.submit([&](sycl::handler &cgh) {
-    cgh.depends_on(e);
-    cgh.host_task([=] { delete sparse_matrix_handle; });
-  });
-#endif
-}
-
-/// Computes a CSR format sparse matrix-dense matrix product.
-/// C = alpha * op(A) * B + beta * C
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans The operation applied to the matrix A.
-/// \param [in] sparse_rows Number of rows of the matrix A.
-/// \param [in] dense_cols Number of columns of the matrix op(B) or C.
-/// \param [in] sparse_cols Number of columns of the matrix A.
-/// \param [in] alpha Scaling factor for the matrix A.
-/// \param [in] info Matrix info of the matrix A.
-/// \param [in] val An array containing the non-zero elements of the matrix A.
-/// \param [in] row_ptr An array of length \p num_rows + 1.
-/// \param [in] col_ind An array containing the column indices in index-based
-/// numbering.
-/// \param [in] b Data of the matrix B.
-/// \param [in] ldb Leading dimension of the matrix B.
-/// \param [in] beta Scaling factor for the matrix B.
-/// \param [in, out] c Data of the matrix C.
-/// \param [in] ldc Leading dimension of the matrix C.
-template <typename T>
-void csrmm(sycl::queue &queue, oneapi::mkl::transpose trans, int sparse_rows,
-           int dense_cols, int sparse_cols, const T *alpha,
-           const std::shared_ptr<matrix_info> info, const T *val,
-           const int *row_ptr, const int *col_ind, const T *b, int ldb,
-           const T *beta, T *c, int ldc) {
-  csrmm<T>(queue, trans, oneapi::mkl::transpose::nontrans, sparse_rows,
-           dense_cols, sparse_cols, alpha, info, val, row_ptr, col_ind, b, ldb,
-           beta, c, ldc);
-}
-
-#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
-/// Saving the optimization information for solving a system of linear
-/// equations.
-class optimize_info {
-public:
-  /// Constructor
-  optimize_info() { oneapi::mkl::sparse::init_matrix_handle(&_matrix_handle); }
-  /// Destructor
-  ~optimize_info() {
-    oneapi::mkl::sparse::release_matrix_handle(get_default_queue(),
-                                               &_matrix_handle, _deps)
-        .wait();
-  }
-  /// Add dependency for the destructor.
-  /// \param [in] e The event which the destructor depends on.
-  void add_dependency(sycl::event e) { _deps.push_back(e); }
-  /// Get the internal saved matrix handle.
-  /// \return Returns the matrix handle.
-  oneapi::mkl::sparse::matrix_handle_t get_matrix_handle() const noexcept {
-    return _matrix_handle;
-  }
-
-private:
-  oneapi::mkl::sparse::matrix_handle_t _matrix_handle = nullptr;
-  std::vector<sycl::event> _deps;
-};
-#endif
-
-#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
-namespace detail {
-#ifdef DPCT_USM_LEVEL_NONE
-#define SPARSE_CALL(CALL, HANDLE) CALL;
-#else
-#define SPARSE_CALL(CALL, HANDLE)                                              \
-  sycl::event e = CALL;                                                        \
-  HANDLE->add_dependency(e);
-#endif
-
-template <typename T> struct optimize_csrsv_impl {
-  void operator()(sycl::queue &queue, oneapi::mkl::transpose trans, int row_col,
-                  const std::shared_ptr<matrix_info> info, const void *val,
-                  const int *row_ptr, const int *col_ind,
-                  std::shared_ptr<optimize_info> optimize_info) {
-    using Ty = typename dpct::DataType<T>::T2;
-    auto data_row_ptr = dpct::detail::get_memory<int>(row_ptr);
-    auto data_col_ind = dpct::detail::get_memory<int>(col_ind);
-    auto data_val = dpct::detail::get_memory<Ty>(val);
-    oneapi::mkl::sparse::set_csr_data(queue, optimize_info->get_matrix_handle(),
-                                      row_col, row_col, info->get_index_base(),
-                                      data_row_ptr, data_col_ind, data_val);
-    if (info->get_matrix_type() != matrix_info::matrix_type::tr)
-      throw std::runtime_error("dpct::sparse::optimize_csrsv_impl()(): "
-                               "oneapi::mkl::sparse::optimize_trsv "
-                               "only accept triangular matrix.");
-    SPARSE_CALL(oneapi::mkl::sparse::optimize_trsv(
-                    queue, info->get_uplo(), trans, info->get_diag(),
-                    optimize_info->get_matrix_handle()),
-                optimize_info);
-  }
-};
-template <typename T> struct csrsv_impl {
-  void operator()(sycl::queue &queue, oneapi::mkl::transpose trans, int row_col,
-                  const void *alpha, const std::shared_ptr<matrix_info> info,
-                  const void *val, const int *row_ptr, const int *col_ind,
-                  std::shared_ptr<optimize_info> optimize_info, const void *x,
-                  void *y) {
-    using Ty = typename dpct::DataType<T>::T2;
-    auto alpha_value =
-        dpct::detail::get_value(static_cast<const Ty *>(alpha), queue);
-    Ty *new_x_ptr = nullptr;
-    if (alpha_value != Ty(1.0f)) {
-      new_x_ptr = (Ty *)dpct::dpct_malloc(row_col * sizeof(Ty));
-      dpct::detail::dpct_memcpy(queue, new_x_ptr, x, row_col * sizeof(Ty),
-                                dpct::memcpy_direction::automatic);
-      auto data_new_x = dpct::detail::get_memory<Ty>(new_x_ptr);
-      oneapi::mkl::blas::column_major::scal(queue, row_col, alpha_value,
-                                            data_new_x, 1);
-    } else {
-      new_x_ptr = const_cast<Ty *>(static_cast<const Ty *>(x));
-    }
-    auto data_new_x = dpct::detail::get_memory<Ty>(new_x_ptr);
-    auto data_y = dpct::detail::get_memory<Ty>(y);
-
-    SPARSE_CALL(oneapi::mkl::sparse::trsv(
-                    queue, info->get_uplo(), trans, info->get_diag(),
-                    optimize_info->get_matrix_handle(), data_new_x, data_y),
-                optimize_info);
-    if (alpha_value != Ty(1.0f)) {
-      dpct::async_dpct_free({new_x_ptr},
-                            {
-#ifndef DPCT_USM_LEVEL_NONE
-                                e
-#endif
-                            },
-                            queue);
-    }
-  }
-};
-} // namespace detail
-
-/// Performs internal optimizations for solving a system of linear equations for
-/// a CSR format sparse matrix.
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans The operation applied to the sparse matrix.
-/// \param [in] row_col Number of rows of the sparse matrix.
-/// \param [in] info Matrix info of the sparse matrix.
-/// \param [in] val An array containing the non-zero elements of the sparse matrix.
-/// \param [in] row_ptr An array of length \p num_rows + 1.
-/// \param [in] col_ind An array containing the column indices in index-based
-/// numbering.
-/// \param [out] optimize_info The result of the optimizations.
-template <typename T>
-void optimize_csrsv(sycl::queue &queue, oneapi::mkl::transpose trans,
-                    int row_col, const std::shared_ptr<matrix_info> info,
-                    const T *val, const int *row_ptr, const int *col_ind,
-                    std::shared_ptr<optimize_info> optimize_info) {
-  detail::optimize_csrsv_impl<T>()(queue, trans, row_col, info, val, row_ptr,
-                                   col_ind, optimize_info);
-}
-
-inline void optimize_csrsv(sycl::queue &queue, oneapi::mkl::transpose trans,
-                           int row_col, const std::shared_ptr<matrix_info> info,
-                           const void *val, library_data_t val_type,
-                           const int *row_ptr, const int *col_ind,
-                           std::shared_ptr<optimize_info> optimize_info) {
-  detail::spblas_shim<detail::optimize_csrsv_impl>(
-      val_type, queue, trans, row_col, info, val, row_ptr, col_ind,
-      optimize_info);
-}
-
-template <typename T>
-void csrsv(sycl::queue &queue, oneapi::mkl::transpose trans, int row_col,
-           const T *alpha, const std::shared_ptr<matrix_info> info,
-           const T *val, const int *row_ptr, const int *col_ind,
-           std::shared_ptr<optimize_info> optimize_info, const T *x, T *y) {
-  detail::csrsv_impl<T>()(queue, trans, row_col, alpha, info, val, row_ptr,
-                          col_ind, optimize_info, x, y);
-}
-
-inline void csrsv(sycl::queue &queue, oneapi::mkl::transpose trans, int row_col,
-                  const void *alpha, library_data_t alpha_type,
-                  const std::shared_ptr<matrix_info> info, const void *val,
-                  library_data_t val_type, const int *row_ptr,
-                  const int *col_ind,
-                  std::shared_ptr<optimize_info> optimize_info, const void *x,
-                  library_data_t x_type, void *y, library_data_t y_type) {
-  detail::spblas_shim<detail::csrsv_impl>(val_type, queue, trans, row_col,
-                                          alpha, info, val, row_ptr, col_ind,
-                                          optimize_info, x, y);
-}
-#endif
-
-class sparse_matrix_desc;
-
-using sparse_matrix_desc_t = std::shared_ptr<sparse_matrix_desc>;
-
-/// Structure for describe a dense vector
-class dense_vector_desc {
-public:
-  dense_vector_desc(std::int64_t ele_num, void *value,
-                    library_data_t value_type)
-      : _ele_num(ele_num), _value(value), _value_type(value_type) {}
-  void get_desc(std::int64_t *ele_num, const void **value,
-                library_data_t *value_type) const noexcept {
-    *ele_num = _ele_num;
-    *value = _value;
-    *value_type = _value_type;
-  }
-  void get_desc(std::int64_t *ele_num, void **value,
-                library_data_t *value_type) const noexcept {
-    get_desc(ele_num, const_cast<const void **>(value), value_type);
-  }
-  void *get_value() const noexcept { return _value; }
-  void set_value(void *value) { _value = value; }
-  library_data_t get_value_type() const noexcept { return _value_type; }
-  std::int64_t get_ele_num() const noexcept { return _ele_num; }
-
-private:
-  std::int64_t _ele_num;
-  void *_value;
-  library_data_t _value_type;
-};
-
-/// Structure for describe a dense matrix
-class dense_matrix_desc {
-public:
-  dense_matrix_desc(std::int64_t row_num, std::int64_t col_num,
-                    std::int64_t leading_dim, void *value,
-                    library_data_t value_type, oneapi::mkl::layout layout)
-      : _row_num(row_num), _col_num(col_num), _leading_dim(leading_dim),
-        _value(value), _value_type(value_type), _layout(layout) {}
-  void get_desc(std::int64_t *row_num, std::int64_t *col_num,
-                std::int64_t *leading_dim, void **value,
-                library_data_t *value_type,
-                oneapi::mkl::layout *layout) const noexcept {
-    *row_num = _row_num;
-    *col_num = _col_num;
-    *leading_dim = _leading_dim;
-    *value = _value;
-    *value_type = _value_type;
-    *layout = _layout;
-  }
-  void *get_value() const noexcept { return _value; }
-  void set_value(void *value) { _value = value; }
-  std::int64_t get_col_num() const noexcept { return _col_num; }
-  std::int64_t get_leading_dim() const noexcept { return _leading_dim; }
-  oneapi::mkl::layout get_layout() const noexcept { return _layout; }
-
-private:
-  std::int64_t _row_num;
-  std::int64_t _col_num;
-  std::int64_t _leading_dim;
-  void *_value;
-  library_data_t _value_type;
-  oneapi::mkl::layout _layout;
-};
-
-/// Sparse matrix data format
-enum matrix_format : int {
-  csr = 1,
-};
-
-/// Sparse matrix attribute
-enum matrix_attribute : int { uplo = 0, diag };
-
-#ifdef __INTEL_MKL__ // The oneMKL Interfaces Project does not support this.
-/// Structure for describe a sparse matrix
-class sparse_matrix_desc {
-public:
-  /// Constructor
-  /// \param [out] desc The descriptor to be created
-  /// \param [in] row_num Number of rows of the sparse matrix.
-  /// \param [in] col_num Number of colums of the sparse matrix.
-  /// \param [in] nnz Non-zero elements in the sparse matrix.
-  /// \param [in] row_ptr An array of length \p row_num + 1. If the \p row_ptr is
-  /// NULL, the sparse_matrix_desc will allocate internal memory for it. This
-  /// internal memory can be gotten from get_shadow_row_ptr().
-  /// \param [in] col_ind An array containing the column indices in index-based
-  /// numbering.
-  /// \param [in] value An array containing the non-zero elements of the sparse matrix.
-  /// \param [in] row_ptr_type Data type of the \p row_ptr .
-  /// \param [in] col_ind_type Data type of the \p col_ind .
-  /// \param [in] base Indicates how input arrays are indexed.
-  /// \param [in] value_type Data type of the \p value .
-  /// \param [in] data_format The matrix data format.
-  sparse_matrix_desc(std::int64_t row_num, std::int64_t col_num,
-                     std::int64_t nnz, void *row_ptr, void *col_ind,
-                     void *value, library_data_t row_ptr_type,
-                     library_data_t col_ind_type, oneapi::mkl::index_base base,
-                     library_data_t value_type, matrix_format data_format)
-      : _row_num(row_num), _col_num(col_num), _nnz(nnz), _row_ptr(row_ptr),
-        _col_ind(col_ind), _value(value), _row_ptr_type(row_ptr_type),
-        _col_ind_type(col_ind_type), _base(base), _value_type(value_type),
-        _data_format(data_format) {
-    if (_data_format != matrix_format::csr) {
-      throw std::runtime_error("the sparse matrix data format is unsupported");
-    }
-    oneapi::mkl::sparse::init_matrix_handle(&_matrix_handle);
-    set_data();
-  }
-  /// Destructor
-  ~sparse_matrix_desc() {
-    oneapi::mkl::sparse::release_matrix_handle(get_default_queue(),
-                                               &_matrix_handle, _deps)
-        .wait();
-  }
-
-  /// Add dependency for the destroy method.
-  /// \param [in] e The event which the destroy method depends on.
-  void add_dependency(sycl::event e) { _deps.push_back(e); }
-  /// Get the internal saved matrix handle.
-  /// \return Returns the matrix handle.
-  oneapi::mkl::sparse::matrix_handle_t get_matrix_handle() const noexcept {
-    return _matrix_handle;
-  }
-  /// Get the values saved in the descriptor
-  /// \param [out] row_num Number of rows of the sparse matrix.
-  /// \param [out] col_num Number of colums of the sparse matrix.
-  /// \param [out] nnz Non-zero elements in the sparse matrix.
-  /// \param [out] row_ptr An array of length \p row_num + 1.
-  /// \param [out] col_ind An array containing the column indices in index-based
-  /// numbering.
-  /// \param [out] value An array containing the non-zero elements of the sparse matrix.
-  /// \param [out] row_ptr_type Data type of the \p row_ptr .
-  /// \param [out] col_ind_type Data type of the \p col_ind .
-  /// \param [out] base Indicates how input arrays are indexed.
-  /// \param [out] value_type Data type of the \p value .
-  void get_desc(int64_t *row_num, int64_t *col_num, int64_t *nnz,
-                void **row_ptr, void **col_ind, void **value,
-                library_data_t *row_ptr_type, library_data_t *col_ind_type,
-                oneapi::mkl::index_base *base,
-                library_data_t *value_type) const noexcept {
-    *row_num = _row_num;
-    *col_num = _col_num;
-    *nnz = _nnz;
-    *row_ptr = _row_ptr;
-    *col_ind = _col_ind;
-    *value = _value;
-    *row_ptr_type = _row_ptr_type;
-    *col_ind_type = _col_ind_type;
-    *base = _base;
-    *value_type = _value_type;
-  }
-  /// Get the sparse matrix data format of this descriptor
-  /// \param [out] format The matrix data format result
-  void get_format(matrix_format *data_format) const noexcept {
-    *data_format = _data_format;
-  }
-  /// Get the index base of this descriptor
-  /// \param [out] base The index base result
-  void get_base(oneapi::mkl::index_base *base) const noexcept { *base = _base; }
-  /// Get the value pointer of this descriptor
-  /// \param [out] value The value pointer result
-  void get_value(void **value) const noexcept { *value = _value; }
-  /// Set the value pointer of this descriptor
-  /// \param [in] value The input value pointer
-  void set_value(void *value) {
-    if (!value) {
-      throw std::runtime_error(
-          "dpct::sparse::sparse_matrix_desc::set_value(): The value "
-          "pointer is NULL.");
-    }
-    if (_value && (_value != value)) {
-      throw std::runtime_error(
-          "dpct::sparse::sparse_matrix_desc::set_value(): "
-          "The _value pointer is not NULL. It cannot be reset.");
-    }
-    _value = value;
-    set_data();
-  }
-  /// Get the size of the sparse matrix
-  /// \param [out] row_num Number of rows of the sparse matrix.
-  /// \param [out] col_num Number of colums of the sparse matrix.
-  /// \param [out] nnz Non-zero elements in the sparse matrix.
-  void get_size(int64_t *row_num, int64_t *col_num,
-                int64_t *nnz) const noexcept {
-    *row_num = _row_num;
-    *col_num = _col_num;
-    *nnz = _nnz;
-  }
-  /// Set the sparse matrix attribute
-  /// \param [in] attribute The attribute type
-  /// \param [in] data The attribute value
-  /// \param [in] data_size The data size of the attribute value
-  void set_attribute(matrix_attribute attribute, const void *data,
-                     size_t data_size) {
-    if (attribute == matrix_attribute::diag) {
-      const oneapi::mkl::diag *diag_ptr =
-          reinterpret_cast<const oneapi::mkl::diag *>(data);
-      if (*diag_ptr == oneapi::mkl::diag::unit) {
-        _diag = oneapi::mkl::diag::unit;
-      } else if (*diag_ptr == oneapi::mkl::diag::nonunit) {
-        _diag = oneapi::mkl::diag::nonunit;
-      } else {
-        throw std::runtime_error("unsupported diag value");
-      }
-    } else if (attribute == matrix_attribute::uplo) {
-      const oneapi::mkl::uplo *uplo_ptr =
-          reinterpret_cast<const oneapi::mkl::uplo *>(data);
-      if (*uplo_ptr == oneapi::mkl::uplo::upper) {
-        _uplo = oneapi::mkl::uplo::upper;
-      } else if (*uplo_ptr == oneapi::mkl::uplo::lower) {
-        _uplo = oneapi::mkl::uplo::lower;
-      } else {
-        throw std::runtime_error("unsupported uplo value");
-      }
-    } else {
-      throw std::runtime_error("unsupported attribute");
-    }
-  }
-  /// Get the sparse matrix attribute
-  /// \param [out] attribute The attribute type
-  /// \param [out] data The attribute value
-  /// \param [out] data_size The data size of the attribute value
-  void get_attribute(matrix_attribute attribute, void *data,
-                     size_t data_size) const {
-    if (attribute == matrix_attribute::diag) {
-      oneapi::mkl::diag *diag_ptr = reinterpret_cast<oneapi::mkl::diag *>(data);
-      if (_diag.has_value()) {
-        *diag_ptr = _diag.value();
-      } else {
-        *diag_ptr = oneapi::mkl::diag::nonunit;
-      }
-    } else if (attribute == matrix_attribute::uplo) {
-      oneapi::mkl::uplo *uplo_ptr = reinterpret_cast<oneapi::mkl::uplo *>(data);
-      if (_uplo.has_value()) {
-        *uplo_ptr = _uplo.value();
-      } else {
-        *uplo_ptr = oneapi::mkl::uplo::lower;
-      }
-    } else {
-      throw std::runtime_error("unsupported attribute");
-    }
-  }
-  /// Set the pointers for describing the sparse matrix
-  /// \param [in] row_ptr An array of length \p row_num + 1.
-  /// \param [in] col_ind An array containing the column indices in index-based
-  /// numbering.
-  /// \param [in] value An array containing the non-zero elements of the sparse matrix.
-  void set_pointers(void *row_ptr, void *col_ind, void *value) {
-    if (!row_ptr) {
-      throw std::runtime_error(
-          "dpct::sparse::sparse_matrix_desc::set_pointers(): The "
-          "row_ptr pointer is NULL.");
-    }
-    if (!col_ind) {
-      throw std::runtime_error(
-          "dpct::sparse::sparse_matrix_desc::set_pointers(): The "
-          "col_ind pointer is NULL.");
-    }
-    if (_row_ptr && (_row_ptr != row_ptr)) {
-      throw std::runtime_error("dpct::sparse::sparse_matrix_desc::set_pointers("
-                               "): The _row_ptr pointer is "
-                               "not NULL. It cannot be reset.");
-    }
-    if (_col_ind && (_col_ind != col_ind)) {
-      throw std::runtime_error("dpct::sparse::sparse_matrix_desc::set_pointers("
-                               "): The _col_ind pointer is "
-                               "not NULL. It cannot be reset.");
-    }
-    _row_ptr = row_ptr;
-    _col_ind = col_ind;
-
-    // The descriptor will be updated in the set_value function
-    set_value(value);
-  }
-
-  /// Get the diag attribute
-  /// \return diag value
-  std::optional<oneapi::mkl::diag> get_diag() const noexcept { return _diag; }
-  /// Get the uplo attribute
-  /// \return uplo value
-  std::optional<oneapi::mkl::uplo> get_uplo() const noexcept { return _uplo; }
-  /// Set the number of non-zero elements
-  /// \param nnz [in] The number of non-zero elements.
-  void set_nnz(std::int64_t nnz) noexcept { _nnz = nnz; }
-  /// Get the type of the value pointer.
-  /// \return The type of the value pointer.
-  library_data_t get_value_type() const noexcept { return _value_type; }
-  /// Get the row_ptr.
-  /// \return The row_ptr.
-  void *get_row_ptr() const noexcept { return _row_ptr; }
-  /// If the internal _row_ptr is NULL, the sparse_matrix_desc will allocate
-  /// internal memory for it in the constructor. The internal memory can be gotten
-  /// from this interface.
-  /// \return The shadow row_ptr.
-  void *get_shadow_row_ptr() const noexcept { return _shadow_row_ptr.get(); }
-  /// Get the type of the col_ind pointer.
-  /// \return The type of the col_ind pointer.
-  library_data_t get_col_ind_type() const noexcept { return _col_ind_type; }
-  /// Get the row_num.
-  /// \return The row_num.
-  std::int64_t get_row_num() const noexcept { return _row_num; }
-
-private:
-  inline static const std::function<void(void *)> _shadow_row_ptr_deleter =
-      [](void *ptr) { dpct::dpct_free(ptr); };
-  template <typename index_t, typename value_t> void set_data() {
-    void *row_ptr = nullptr;
-    if (_shadow_row_ptr) {
-      row_ptr = _shadow_row_ptr.get();
-    } else if (_row_ptr) {
-      row_ptr = _row_ptr;
-    } else {
-      row_ptr = dpct::dpct_malloc(sizeof(index_t) * (_row_num + 1),
-                                  get_default_queue());
-      _shadow_row_ptr.reset(row_ptr);
-    }
-#ifdef DPCT_USM_LEVEL_NONE
-    using data_index_t = sycl::buffer<index_t>;
-    using data_value_t = sycl::buffer<value_t>;
-#else
-    using data_index_t = index_t *;
-    using data_value_t = value_t *;
-#endif
-    _data_row_ptr = dpct::detail::get_memory<index_t>(row_ptr);
-    _data_col_ind = dpct::detail::get_memory<index_t>(_col_ind);
-    _data_value = dpct::detail::get_memory<value_t>(_value);
-    oneapi::mkl::sparse::set_csr_data(get_default_queue(), _matrix_handle,
-                                      _row_num, _col_num, _base,
-                                      std::get<data_index_t>(_data_row_ptr),
-                                      std::get<data_index_t>(_data_col_ind),
-                                      std::get<data_value_t>(_data_value));
-    get_default_queue().wait();
-  }
-
-  void set_data() {
-    std::uint64_t key = dpct::detail::get_type_combination_id(
-        _row_ptr_type, _col_ind_type, _value_type);
-    switch (key) {
-    case dpct::detail::get_type_combination_id(library_data_t::real_int32,
-                                               library_data_t::real_int32,
-                                               library_data_t::real_float): {
-      set_data<std::int32_t, float>();
-      break;
-    }
-    case dpct::detail::get_type_combination_id(library_data_t::real_int32,
-                                               library_data_t::real_int32,
-                                               library_data_t::real_double): {
-      set_data<std::int32_t, double>();
-      break;
-    }
-    case dpct::detail::get_type_combination_id(library_data_t::real_int32,
-                                               library_data_t::real_int32,
-                                               library_data_t::complex_float): {
-      set_data<std::int32_t, std::complex<float>>();
-      break;
-    }
-    case dpct::detail::get_type_combination_id(
-        library_data_t::real_int32, library_data_t::real_int32,
-        library_data_t::complex_double): {
-      set_data<std::int32_t, std::complex<double>>();
-      break;
-    }
-    case dpct::detail::get_type_combination_id(library_data_t::real_int64,
-                                               library_data_t::real_int64,
-                                               library_data_t::real_float): {
-      set_data<std::int64_t, float>();
-      break;
-    }
-    case dpct::detail::get_type_combination_id(library_data_t::real_int64,
-                                               library_data_t::real_int64,
-                                               library_data_t::real_double): {
-      set_data<std::int64_t, double>();
-      break;
-    }
-    case dpct::detail::get_type_combination_id(library_data_t::real_int64,
-                                               library_data_t::real_int64,
-                                               library_data_t::complex_float): {
-      set_data<std::int64_t, std::complex<float>>();
-      break;
-    }
-    case dpct::detail::get_type_combination_id(
-        library_data_t::real_int64, library_data_t::real_int64,
-        library_data_t::complex_double): {
-      set_data<std::int64_t, std::complex<double>>();
-      break;
-    }
-    default:
-      throw std::runtime_error("the combination of data type is unsupported");
-    }
-  }
-
-  std::int64_t _row_num;
-  std::int64_t _col_num;
-  std::int64_t _nnz;
-  void *_row_ptr;
-  void *_col_ind;
-  void *_value;
-  library_data_t _row_ptr_type;
-  library_data_t _col_ind_type;
-  oneapi::mkl::index_base _base;
-  library_data_t _value_type;
-  oneapi::mkl::sparse::matrix_handle_t _matrix_handle = nullptr;
-  std::vector<sycl::event> _deps;
-  matrix_format _data_format;
-  std::optional<oneapi::mkl::uplo> _uplo;
-  std::optional<oneapi::mkl::diag> _diag;
-  std::unique_ptr<void, std::function<void(void *)>> _shadow_row_ptr =
-      std::unique_ptr<void, std::function<void(void *)>>(
-          nullptr, _shadow_row_ptr_deleter);
-
-  static constexpr size_t _max_data_variable_size = std::max(
-      {sizeof(sycl::buffer<std::int32_t>), sizeof(sycl::buffer<std::int64_t>),
-       sizeof(sycl::buffer<float>), sizeof(sycl::buffer<double>),
-       sizeof(sycl::buffer<std::complex<float>>),
-       sizeof(sycl::buffer<std::complex<double>>), sizeof(void *)});
-  using index_variant_t =
-      std::variant<std::array<std::byte, _max_data_variable_size>,
-                   sycl::buffer<std::int32_t>, sycl::buffer<std::int64_t>,
-                   std::int32_t *, std::int64_t *>;
-  using value_variant_t =
-      std::variant<std::array<std::byte, _max_data_variable_size>,
-                   sycl::buffer<float>, sycl::buffer<double>,
-                   sycl::buffer<std::complex<float>>,
-                   sycl::buffer<std::complex<double>>, float *, double *,
-                   std::complex<float> *, std::complex<double> *>;
-  index_variant_t _data_row_ptr;
-  index_variant_t _data_col_ind;
-  value_variant_t _data_value;
-};
-
-namespace detail {
-template <typename T> struct spmv_impl {
-  void operator()(sycl::queue queue, oneapi::mkl::transpose trans,
-                  const void *alpha, sparse_matrix_desc_t a,
-                  std::shared_ptr<dense_vector_desc> x, const void *beta,
-                  std::shared_ptr<dense_vector_desc> y) {
-    auto alpha_value =
-        dpct::detail::get_value(reinterpret_cast<const T *>(alpha), queue);
-    auto beta_value =
-        dpct::detail::get_value(reinterpret_cast<const T *>(beta), queue);
-    auto data_x = dpct::detail::get_memory<T>(x->get_value());
-    auto data_y = dpct::detail::get_memory<T>(y->get_value());
-    if (a->get_diag().has_value() && a->get_uplo().has_value()) {
-      oneapi::mkl::sparse::optimize_trmv(queue, a->get_uplo().value(), trans,
-                                         a->get_diag().value(),
-                                         a->get_matrix_handle());
-      SPARSE_CALL(oneapi::mkl::sparse::trmv(queue, a->get_uplo().value(), trans,
-                                            a->get_diag().value(), alpha_value,
-                                            a->get_matrix_handle(), data_x,
-                                            beta_value, data_y),
-                  a);
-    } else {
-      oneapi::mkl::sparse::optimize_gemv(queue, trans, a->get_matrix_handle());
-      SPARSE_CALL(oneapi::mkl::sparse::gemv(queue, trans, alpha_value,
-                                            a->get_matrix_handle(), data_x,
-                                            beta_value, data_y),
-                  a);
-    }
-  }
-};
-
-template <typename T> struct spmm_impl {
-  void operator()(sycl::queue queue, oneapi::mkl::transpose trans_a,
-                  oneapi::mkl::transpose trans_b, const void *alpha,
-                  sparse_matrix_desc_t a, std::shared_ptr<dense_matrix_desc> b,
-                  const void *beta, std::shared_ptr<dense_matrix_desc> c) {
-    auto alpha_value =
-        dpct::detail::get_value(reinterpret_cast<const T *>(alpha), queue);
-    auto beta_value =
-        dpct::detail::get_value(reinterpret_cast<const T *>(beta), queue);
-    auto data_b = dpct::detail::get_memory<T>(b->get_value());
-    auto data_c = dpct::detail::get_memory<T>(c->get_value());
-    SPARSE_CALL(
-        oneapi::mkl::sparse::gemm(queue, b->get_layout(), trans_a, trans_b,
-                                  alpha_value, a->get_matrix_handle(), data_b,
-                                  b->get_col_num(), b->get_leading_dim(),
-                                  beta_value, data_c, c->get_leading_dim()),
-        a);
-  }
-};
-#undef SPARSE_CALL
-} // namespace detail
-
-/// Computes a sparse matrix-dense vector product: y = alpha * op(a) * x + beta * y.
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans Specifies operation on input matrix.
-/// \param [in] alpha Specifies the scalar alpha.
-/// \param [in] a Specifies the sparse matrix a.
-/// \param [in] x Specifies the dense vector x.
-/// \param [in] beta Specifies the scalar beta.
-/// \param [in, out] y Specifies the dense vector y.
-/// \param [in] data_type Specifies the data type of \param a, \param x and \param y .
-inline void spmv(sycl::queue queue, oneapi::mkl::transpose trans,
-                 const void *alpha, sparse_matrix_desc_t a,
-                 std::shared_ptr<dense_vector_desc> x, const void *beta,
-                 std::shared_ptr<dense_vector_desc> y,
-                 library_data_t data_type) {
-  detail::spblas_shim<detail::spmv_impl>(data_type, queue, trans, alpha, a, x,
-                                         beta, y);
-}
-
-/// Computes a sparse matrix-dense matrix product: c = alpha * op(a) * op(b) + beta * c.
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans_a Specifies operation on input matrix a.
-/// \param [in] trans_b Specifies operation on input matrix b.
-/// \param [in] alpha Specifies the scalar alpha.
-/// \param [in] a Specifies the sparse matrix a.
-/// \param [in] b Specifies the dense matrix b.
-/// \param [in] beta Specifies the scalar beta.
-/// \param [in, out] c Specifies the dense matrix c.
-/// \param [in] data_type Specifies the data type of \param a, \param b and \param c .
-inline void spmm(sycl::queue queue, oneapi::mkl::transpose trans_a,
-                 oneapi::mkl::transpose trans_b, const void *alpha,
-                 sparse_matrix_desc_t a, std::shared_ptr<dense_matrix_desc> b,
-                 const void *beta, std::shared_ptr<dense_matrix_desc> c,
-                 library_data_t data_type) {
-  if (b->get_layout() != c->get_layout())
-    throw std::runtime_error("the layout of b and c are different");
-  detail::spblas_shim<detail::spmm_impl>(data_type, queue, trans_a, trans_b,
-                                         alpha, a, b, beta, c);
-}
-
-namespace detail {
-template <typename T, bool is_host_memory, typename host_memory_t = void>
-struct temp_memory {
-  static_assert(!is_host_memory || !std::is_same_v<host_memory_t, void>,
-                "host_memory_t cannot be void when the input parameter ptr "
-                "points to host memory");
-  temp_memory(sycl::queue queue, void *ptr)
-      : _queue(queue)
-#ifdef DPCT_USM_LEVEL_NONE
-        ,
-        _buffer(is_host_memory ? sycl::buffer<T, 1>(sycl::range<1>(1))
-                               : sycl::buffer<T, 1>(dpct::get_buffer<T>(ptr)))
-#endif
-  {
-    if constexpr (is_host_memory) {
-      _original_host_ptr = static_cast<host_memory_t *>(ptr);
-#ifdef DPCT_USM_LEVEL_NONE
-      auto _buffer_acc = _buffer.get_host_access(sycl::write_only);
-      _buffer_acc[0] = static_cast<T>(*_original_host_ptr);
-#else
-      _memory_ptr = sycl::malloc_host<T>(1, _queue);
-      *_memory_ptr = static_cast<T>(*_original_host_ptr);
-#endif
-    } else {
-#ifndef DPCT_USM_LEVEL_NONE
-      _memory_ptr = static_cast<T *>(ptr);
-#endif
-    }
-  }
-
-  ~temp_memory() {
-    if constexpr (is_host_memory) {
-#ifdef DPCT_USM_LEVEL_NONE
-      auto _buffer_acc = _buffer.get_host_access(sycl::read_only);
-      *_original_host_ptr = static_cast<host_memory_t>(_buffer_acc[0]);
-#else
-      _queue.wait();
-      *_original_host_ptr = *_memory_ptr;
-      sycl::free(_memory_ptr, _queue);
-#endif
-    }
-  }
-  auto get_memory_ptr() {
-#ifdef DPCT_USM_LEVEL_NONE
-    return &_buffer;
-#else
-    return _memory_ptr;
-#endif
-  }
-
-private:
-  sycl::queue _queue;
-  host_memory_t *_original_host_ptr = nullptr;
-#ifdef DPCT_USM_LEVEL_NONE
-  sycl::buffer<T, 1> _buffer;
-#else
-  T *_memory_ptr;
-#endif
-};
-} // namespace detail
-
-/// Do initial estimation of work and load balancing of computing a sparse
-/// matrix-sparse matrix product.
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans_a Specifies operation on input matrix a.
-/// \param [in] trans_b Specifies operation on input matrix b.
-/// \param [in] alpha Specifies the scalar alpha.
-/// \param [in] a Specifies the sparse matrix a.
-/// \param [in] b Specifies the sparse matrix b.
-/// \param [in] beta Specifies the scalar beta.
-/// \param [in, out] c Specifies the sparse matrix c.
-/// \param [in] matmat_descr Describes the sparse matrix-sparse matrix operation
-/// to be executed.
-/// \param [in, out] size_temp_buffer Specifies the size of workspace.
-/// \param [in] temp_buffer Specifies the memory of the workspace.
-inline void
-spgemm_work_estimation(sycl::queue queue, oneapi::mkl::transpose trans_a,
-                       oneapi::mkl::transpose trans_b, const void *alpha,
-                       sparse_matrix_desc_t a, sparse_matrix_desc_t b,
-                       const void *beta, sparse_matrix_desc_t c,
-                       oneapi::mkl::sparse::matmat_descr_t matmat_descr,
-                       size_t *size_temp_buffer, void *temp_buffer) {
-  if (temp_buffer) {
-    detail::temp_memory<std::int64_t, true, size_t> size_memory(
-        queue, size_temp_buffer);
-    detail::temp_memory<std::uint8_t, false> work_memory(queue, temp_buffer);
-    oneapi::mkl::sparse::matmat(
-        queue, a->get_matrix_handle(), b->get_matrix_handle(),
-        c->get_matrix_handle(),
-        oneapi::mkl::sparse::matmat_request::work_estimation, matmat_descr,
-        size_memory.get_memory_ptr(), work_memory.get_memory_ptr()
-#ifndef DPCT_USM_LEVEL_NONE
-        , {}
-#endif
-    );
-  } else {
-    oneapi::mkl::sparse::set_matmat_data(
-        matmat_descr, oneapi::mkl::sparse::matrix_view_descr::general, trans_a,
-        oneapi::mkl::sparse::matrix_view_descr::general, trans_b,
-        oneapi::mkl::sparse::matrix_view_descr::general);
-    detail::temp_memory<std::int64_t, true, size_t> size_memory(
-        queue, size_temp_buffer);
-    oneapi::mkl::sparse::matmat(
-        queue, a->get_matrix_handle(), b->get_matrix_handle(),
-        c->get_matrix_handle(),
-        oneapi::mkl::sparse::matmat_request::get_work_estimation_buf_size,
-        matmat_descr, size_memory.get_memory_ptr(), nullptr
-#ifndef DPCT_USM_LEVEL_NONE
-        , {}
-#endif
-    );
-  }
-}
-
-/// Do internal products for computing the C matrix of computing a sparse
-/// matrix-sparse matrix product.
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans_a Specifies operation on input matrix a.
-/// \param [in] trans_b Specifies operation on input matrix b.
-/// \param [in] alpha Specifies the scalar alpha.
-/// \param [in] a Specifies the sparse matrix a.
-/// \param [in] b Specifies the sparse matrix b.
-/// \param [in] beta Specifies the scalar beta.
-/// \param [in, out] c Specifies the sparse matrix c.
-/// \param [in] matmat_descr Describes the sparse matrix-sparse matrix operation
-/// to be executed.
-/// \param [in, out] size_temp_buffer Specifies the size of workspace.
-/// \param [in] temp_buffer Specifies the memory of the workspace.
-inline void spgemm_compute(sycl::queue queue, oneapi::mkl::transpose trans_a,
-                           oneapi::mkl::transpose trans_b, const void *alpha,
-                           sparse_matrix_desc_t a, sparse_matrix_desc_t b,
-                           const void *beta, sparse_matrix_desc_t c,
-                           oneapi::mkl::sparse::matmat_descr_t matmat_descr,
-                           size_t *size_temp_buffer, void *temp_buffer) {
-  if (temp_buffer) {
-    std::int64_t nnz_value = 0;
-    {
-      detail::temp_memory<std::int64_t, true, size_t> size_memory(
-          queue, size_temp_buffer);
-      detail::temp_memory<std::uint8_t, false> work_memory(queue, temp_buffer);
-      detail::temp_memory<std::int64_t, true, std::int64_t> nnz_memory(
-          queue, &nnz_value);
-      oneapi::mkl::sparse::matmat(
-          queue, a->get_matrix_handle(), b->get_matrix_handle(),
-          c->get_matrix_handle(), oneapi::mkl::sparse::matmat_request::compute,
-          matmat_descr, size_memory.get_memory_ptr(),
-          work_memory.get_memory_ptr()
-#ifndef DPCT_USM_LEVEL_NONE
-          , {}
-#endif
-      );
-      oneapi::mkl::sparse::matmat(
-          queue, a->get_matrix_handle(), b->get_matrix_handle(),
-          c->get_matrix_handle(), oneapi::mkl::sparse::matmat_request::get_nnz,
-          matmat_descr, nnz_memory.get_memory_ptr(), nullptr
-#ifndef DPCT_USM_LEVEL_NONE
-          , {}
-#endif
-      );
-    }
-    c->set_nnz(nnz_value);
-  } else {
-    detail::temp_memory<std::int64_t, true, size_t> size_memory(
-        queue, size_temp_buffer);
-    oneapi::mkl::sparse::matmat(
-        queue, a->get_matrix_handle(), b->get_matrix_handle(),
-        c->get_matrix_handle(),
-        oneapi::mkl::sparse::matmat_request::get_compute_buf_size, matmat_descr,
-        size_memory.get_memory_ptr(), nullptr
-#ifndef DPCT_USM_LEVEL_NONE
-        , {}
-#endif
-    );
-  }
-}
-
-/// Do any remaining internal products and accumulation and transfer into final
-/// C matrix arrays of computing a sparse matrix-sparse matrix product.
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans_a Specifies operation on input matrix a.
-/// \param [in] trans_b Specifies operation on input matrix b.
-/// \param [in] alpha Specifies the scalar alpha.
-/// \param [in] a Specifies the sparse matrix a.
-/// \param [in] b Specifies the sparse matrix b.
-/// \param [in] beta Specifies the scalar beta.
-/// \param [in, out] c Specifies the sparse matrix c.
-/// \param [in] matmat_descr Describes the sparse matrix-sparse matrix operation
-/// to be executed.
-inline void spgemm_finalize(sycl::queue queue, oneapi::mkl::transpose trans_a,
-                            oneapi::mkl::transpose trans_b, const void *alpha,
-                            sparse_matrix_desc_t a, sparse_matrix_desc_t b,
-                            const void *beta, sparse_matrix_desc_t c,
-                            oneapi::mkl::sparse::matmat_descr_t matmat_descr) {
-  oneapi::mkl::sparse::matmat(queue, a->get_matrix_handle(),
-                              b->get_matrix_handle(), c->get_matrix_handle(),
-                              oneapi::mkl::sparse::matmat_request::finalize,
-                              matmat_descr, nullptr, nullptr
-#ifdef DPCT_USM_LEVEL_NONE
-  );
-#else
-  , {}).wait();
-#endif
-  if (c->get_shadow_row_ptr()) {
-    switch (c->get_col_ind_type()) {
-    case library_data_t::real_int32: {
-      dpct::dpct_memcpy(c->get_row_ptr(), c->get_shadow_row_ptr(),
-                        sizeof(std::int32_t) * (c->get_row_num() + 1));
-      break;
-    }
-    case library_data_t::real_int64: {
-      dpct::dpct_memcpy(c->get_row_ptr(), c->get_shadow_row_ptr(),
-                        sizeof(std::int64_t) * (c->get_row_num() + 1));
-      break;
-    }
-    default:
-      throw std::runtime_error("dpct::sparse::spgemm_finalize(): The data type "
-                               "of the col_ind in matrix c is unsupported.");
-    }
-  }
-}
-
-namespace detail {
-template <typename T> struct spsv_impl {
-  void operator()(sycl::queue queue, oneapi::mkl::uplo uplo,
-                  oneapi::mkl::diag diag, oneapi::mkl::transpose trans_a,
-                  const void *alpha, sparse_matrix_desc_t a,
-                  std::shared_ptr<dense_vector_desc> x,
-                  std::shared_ptr<dense_vector_desc> y) {
-    auto alpha_value =
-        dpct::detail::get_value(reinterpret_cast<const T *>(alpha), queue);
-    T *new_x_ptr = nullptr;
-    if (alpha_value != T(1.0f)) {
-      new_x_ptr = (T *)dpct::dpct_malloc(x->get_ele_num() * sizeof(T));
-      dpct::dpct_memcpy(new_x_ptr, x->get_value(),
-                        x->get_ele_num() * sizeof(T));
-      auto data_new_x = dpct::detail::get_memory<T>(new_x_ptr);
-      oneapi::mkl::blas::column_major::scal(queue, x->get_ele_num(),
-                                            alpha_value, data_new_x, 1);
-    } else {
-      new_x_ptr = static_cast<T *>(x->get_value());
-    }
-    auto data_new_x = dpct::detail::get_memory<T>(new_x_ptr);
-    auto data_y = dpct::detail::get_memory<T>(y->get_value());
-    oneapi::mkl::sparse::trsv(queue, uplo, trans_a, diag,
-                              a->get_matrix_handle(), data_new_x, data_y);
-    if (alpha_value != T(1.0f)) {
-      queue.wait();
-      dpct::dpct_free(new_x_ptr);
-    }
-  }
-};
-} // namespace detail
-
-/// Performs internal optimizations for spsv by analyzing the provided matrix
-/// structure and operation parameters.
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans_a Specifies operation on input matrix a.
-/// \param [in] a Specifies the sparse matrix a.
-inline void spsv_optimize(sycl::queue queue, oneapi::mkl::transpose trans_a,
-                          sparse_matrix_desc_t a) {
-  if (!a->get_uplo() || !a->get_diag()) {
-    throw std::runtime_error(
-        "dpct::sparse::spsv_optimize(): oneapi::mkl::sparse::optimize_trsv "
-        "needs uplo and diag attributes to be specified.");
-  }
-  oneapi::mkl::sparse::optimize_trsv(
-      queue, a->get_uplo().value(), oneapi::mkl::transpose::nontrans,
-      a->get_diag().value(), a->get_matrix_handle());
-}
-
-/// Solves a system of linear equations for a sparse matrix.
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] trans_a Specifies operation on input matrix a.
-/// \param [in] alpha Specifies the scalar alpha.
-/// \param [in] a Specifies the sparse matrix a.
-/// \param [in] x Specifies the dense vector x.
-/// \param [in, out] y Specifies the dense vector y.
-/// \param [in] data_type Specifies the data type of \param a, \param x and
-/// \param y .
-inline void spsv(sycl::queue queue, oneapi::mkl::transpose trans_a,
-                 const void *alpha, sparse_matrix_desc_t a,
-                 std::shared_ptr<dense_vector_desc> x,
-                 std::shared_ptr<dense_vector_desc> y,
-                 library_data_t data_type) {
-  if (!a->get_uplo() || !a->get_diag()) {
-    throw std::runtime_error(
-        "dpct::sparse::spsv(): oneapi::mkl::sparse::trsv needs uplo and diag "
-        "attributes to be specified.");
-  }
-  oneapi::mkl::uplo uplo = a->get_uplo().value();
-  oneapi::mkl::diag diag = a->get_diag().value();
-  detail::spblas_shim<detail::spsv_impl>(a->get_value_type(), queue, uplo, diag,
-                                         trans_a, alpha, a, x, y);
-}
-
-namespace detail {
-template <typename T> struct csr2csc_impl {
-  void operator()(sycl::queue queue, int m, int n, int nnz,
-                  const void *from_val, const int *from_row_ptr,
-                  const int *from_col_ind, void *to_val, int *to_col_ptr,
-                  int *to_row_ind, conversion_scope range,
-                  oneapi::mkl::index_base base) {
-    using Ty = typename dpct::DataType<T>::T2;
-    oneapi::mkl::sparse::matrix_handle_t from_handle = nullptr;
-    oneapi::mkl::sparse::matrix_handle_t to_handle = nullptr;
-    oneapi::mkl::sparse::init_matrix_handle(&from_handle);
-    oneapi::mkl::sparse::init_matrix_handle(&to_handle);
-    auto data_from_row_ptr = dpct::detail::get_memory<int>(from_row_ptr);
-    auto data_from_col_ind = dpct::detail::get_memory<int>(from_col_ind);
-    auto data_from_val = dpct::detail::get_memory<Ty>(from_val);
-    auto data_to_col_ptr = dpct::detail::get_memory<int>(to_col_ptr);
-    auto data_to_row_ind = dpct::detail::get_memory<int>(to_row_ind);
-    void *new_to_value = to_val;
-    if (range == conversion_scope::index) {
-      new_to_value = dpct::dpct_malloc(sizeof(Ty) * nnz);
-    }
-    auto data_to_val = dpct::detail::get_memory<Ty>(new_to_value);
-    oneapi::mkl::sparse::set_csr_data(queue, from_handle, m, n, base,
-                                      data_from_row_ptr, data_from_col_ind,
-                                      data_from_val);
-    oneapi::mkl::sparse::set_csr_data(queue, to_handle, n, m, base,
-                                      data_to_col_ptr, data_to_row_ind,
-                                      data_to_val);
-    sycl::event e1 = oneapi::mkl::sparse::omatcopy(
-        queue, oneapi::mkl::transpose::trans, from_handle, to_handle);
-    oneapi::mkl::sparse::release_matrix_handle(queue, &from_handle, {e1});
-    sycl::event e2 =
-        oneapi::mkl::sparse::release_matrix_handle(queue, &to_handle, {e1});
-    if (range == conversion_scope::index) {
-      dpct::async_dpct_free({new_to_value}, {e2}, queue);
-    }
-  }
-};
-} // namespace detail
-
-/// Convert a CSR sparse matrix to a CSC sparse matrix.
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] m Number of rows of the matrix.
-/// \param [in] n Number of columns of the matrix.
-/// \param [in] nnz Number of non-zero elements.
-/// \param [in] from_val An array containing the non-zero elements of the input
-/// matrix.
-/// \param [in] from_row_ptr An array of length \p m + 1.
-/// \param [in] from_col_ind An array containing the column indices in
-/// index-based numbering.
-/// \param [out] to_val An array containing the non-zero elements of the output
-/// matrix.
-/// \param [out] to_col_ptr An array of length \p n + 1.
-/// \param [out] to_row_ind An array containing the row indices in index-based
-/// numbering.
-/// \param [in] range Specifies the conversion scope.
-/// \param [in] base Specifies the index base.
-template <typename T>
-inline void csr2csc(sycl::queue queue, int m, int n, int nnz, const T *from_val,
-                    const int *from_row_ptr, const int *from_col_ind, T *to_val,
-                    int *to_col_ptr, int *to_row_ind, conversion_scope range,
-                    oneapi::mkl::index_base base) {
-  detail::csr2csc_impl<T>()(queue, m, n, nnz, from_val, from_row_ptr,
-                            from_col_ind, to_val, to_col_ptr, to_row_ind, range,
-                            base);
-}
-
-/// Convert a CSR sparse matrix to a CSC sparse matrix.
-/// \param [in] queue The queue where the routine should be executed. It must
-/// have the in_order property when using the USM mode.
-/// \param [in] m Number of rows of the matrix.
-/// \param [in] n Number of columns of the matrix.
-/// \param [in] nnz Number of non-zero elements.
-/// \param [in] from_val An array containing the non-zero elements of the input
-/// matrix.
-/// \param [in] from_row_ptr An array of length \p m + 1.
-/// \param [in] from_col_ind An array containing the column indices in
-/// index-based numbering.
-/// \param [out] to_val An array containing the non-zero elements of the output
-/// matrix.
-/// \param [out] to_col_ptr An array of length \p n + 1.
-/// \param [out] to_row_ind An array containing the row indices in index-based
-/// numbering.
-/// \param [in] value_type Data type of \p from_val and \p to_val .
-/// \param [in] range Specifies the conversion scope.
-/// \param [in] base Specifies the index base.
-inline void csr2csc(sycl::queue queue, int m, int n, int nnz,
-                    const void *from_val, const int *from_row_ptr,
-                    const int *from_col_ind, void *to_val, int *to_col_ptr,
-                    int *to_row_ind, library_data_t value_type,
-                    conversion_scope range, oneapi::mkl::index_base base) {
-  detail::spblas_shim<detail::csr2csc_impl>(
-      value_type, queue, m, n, nnz, from_val, from_row_ptr, from_col_ind,
-      to_val, to_col_ptr, to_row_ind, range, base);
-}
-#endif
-} // namespace sparse
-} // namespace dpct
-
-#endif // __DPCT_SPARSE_UTILS_HPP__
diff --git a/dpct/util.hpp b/dpct/util.hpp
deleted file mode 100644
index 9af7c924701b7..0000000000000
--- a/dpct/util.hpp
+++ /dev/null
@@ -1,1070 +0,0 @@
-//==---- util.hpp ---------------------------------*- C++ -*----------------==//
-//
-// Copyright (C) Intel Corporation
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// See https://llvm.org/LICENSE.txt for license information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __DPCT_UTIL_HPP__
-#define __DPCT_UTIL_HPP__
-
-#include <sycl/sycl.hpp>
-#include <complex>
-#include <type_traits>
-#include <cassert>
-#include <cstdint>
-
-// TODO: Remove these function definitions once they exist in the DPC++ compiler
-#if defined(__SYCL_DEVICE_ONLY__) && defined(__INTEL_LLVM_COMPILER)
-template <typename T>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT __attribute__((noduplicate))
-T __spirv_GroupNonUniformShuffle(__spv::Scope::Flag, T, unsigned) noexcept;
-
-template <typename T>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT __attribute__((noduplicate))
-T __spirv_GroupNonUniformShuffleDown(__spv::Scope::Flag, T, unsigned) noexcept;
-
-template <typename T>
-__SYCL_CONVERGENT__ extern SYCL_EXTERNAL __SYCL_EXPORT __attribute__((noduplicate))
-T __spirv_GroupNonUniformShuffleUp(__spv::Scope::Flag, T, unsigned) noexcept;
-#endif
-
-namespace dpct {
-
-namespace detail {
-
-template <typename tag, typename T> class generic_error_type {
-public:
-  generic_error_type() = default;
-  generic_error_type(T value) : value{value} {}
-  operator T() const { return value; }
-
-private:
-  T value;
-};
-
-} // namespace detail
-
-using err0 = detail::generic_error_type<struct err0_tag, int>;
-using err1 = detail::generic_error_type<struct err1_tag, int>;
-
-template <int... Ints> struct integer_sequence {};
-template <int Size, int... Ints>
-struct make_index_sequence
-    : public make_index_sequence<Size - 1, Size - 1, Ints...> {};
-template <int... Ints>
-struct make_index_sequence<0, Ints...> : public integer_sequence<Ints...> {};
-
-template <typename T> struct DataType { using T2 = T; };
-template <typename T> struct DataType<sycl::vec<T, 2>> {
-  using T2 = std::complex<T>;
-};
-
-inline void matrix_mem_copy(void *to_ptr, const void *from_ptr, int to_ld,
-                            int from_ld, int rows, int cols, int elem_size,
-                            memcpy_direction direction = automatic,
-                            sycl::queue &queue = dpct::get_default_queue(),
-                            bool async = false) {
-  if (to_ptr == from_ptr && to_ld == from_ld) {
-    return;
-  }
-
-  if (to_ld == from_ld) {
-    size_t copy_size = elem_size * ((cols - 1) * (size_t)to_ld + rows);
-    if (async)
-      detail::dpct_memcpy(queue, (void *)to_ptr, (void *)from_ptr,
-                          copy_size, direction);
-    else
-      detail::dpct_memcpy(queue, (void *)to_ptr, (void *)from_ptr,
-                          copy_size, direction).wait();
-  } else {
-    if (async)
-      detail::dpct_memcpy(queue, to_ptr, from_ptr, elem_size * to_ld,
-                          elem_size * from_ld, elem_size * rows, cols,
-                          direction);
-    else
-      sycl::event::wait(detail::dpct_memcpy(
-          queue, to_ptr, from_ptr, elem_size * to_ld, elem_size * from_ld,
-          elem_size * rows, cols, direction));
-  }
-}
-
-/// Copy matrix data. The default leading dimension is column.
-/// \param [out] to_ptr A pointer points to the destination location.
-/// \param [in] from_ptr A pointer points to the source location.
-/// \param [in] to_ld The leading dimension the destination matrix.
-/// \param [in] from_ld The leading dimension the source matrix.
-/// \param [in] rows The number of rows of the source matrix.
-/// \param [in] cols The number of columns of the source matrix.
-/// \param [in] direction The direction of the data copy.
-/// \param [in] queue The queue where the routine should be executed.
-/// \param [in] async If this argument is true, the return of the function
-/// does NOT guarantee the copy is completed.
-template <typename T>
-inline void matrix_mem_copy(T *to_ptr, const T *from_ptr, int to_ld,
-                            int from_ld, int rows, int cols,
-                            memcpy_direction direction = automatic,
-                            sycl::queue &queue = dpct::get_default_queue(),
-                            bool async = false) {
-  using Ty = typename DataType<T>::T2;
-  matrix_mem_copy((void *)to_ptr, (void *)from_ptr, to_ld, from_ld, rows, cols,
-                  sizeof(Ty), direction, queue, async);
-}
-
-/// Cast the high or low 32 bits of a double to an integer.
-/// \param [in] d The double value.
-/// \param [in] use_high32 Cast the high 32 bits of the double if true;
-/// otherwise cast the low 32 bits.
-inline int cast_double_to_int(double d, bool use_high32 = true) {
-  sycl::vec<double, 1> v0{d};
-  auto v1 = v0.as<sycl::int2>();
-  if (use_high32)
-    return v1[1];
-  return v1[0];
-}
-
-/// Combine two integers, the first as the high 32 bits and the second
-/// as the low 32 bits, into a double.
-/// \param [in] high32 The integer as the high 32 bits
-/// \param [in] low32 The integer as the low 32 bits
-inline double cast_ints_to_double(int high32, int low32) {
-  sycl::int2 v0{low32, high32};
-  auto v1 = v0.as<sycl::vec<double, 1>>();
-  return v1;
-}
-
-/// Reverse the bit order of an unsigned integer
-/// \param [in] a Input unsigned integer value
-/// \returns Value of a with the bit order reversed
-template <typename T> inline T reverse_bits(T a) {
-  static_assert(std::is_unsigned<T>::value && std::is_integral<T>::value,
-                "unsigned integer required");
-  if (!a)
-    return 0;
-  T mask = 0;
-  size_t count = 4 * sizeof(T);
-  mask = ~mask >> count;
-  while (count) {
-    a = ((a & mask) << count) | ((a & ~mask) >> count);
-    count = count >> 1;
-    mask = mask ^ (mask << count);
-  }
-  return a;
-}
-
-/// \param [in] a The first value contains 4 bytes
-/// \param [in] b The second value contains 4 bytes
-/// \param [in] s The selector value, only lower 16bit used
-/// \returns the permutation result of 4 bytes selected in the way
-/// specified by \p s from \p a and \p b
-inline unsigned int byte_level_permute(unsigned int a, unsigned int b,
-                                       unsigned int s) {
-  unsigned int ret;
-  ret =
-      ((((std::uint64_t)b << 32 | a) >> (s & 0x7) * 8) & 0xff) |
-      (((((std::uint64_t)b << 32 | a) >> ((s >> 4) & 0x7) * 8) & 0xff) << 8) |
-      (((((std::uint64_t)b << 32 | a) >> ((s >> 8) & 0x7) * 8) & 0xff) << 16) |
-      (((((std::uint64_t)b << 32 | a) >> ((s >> 12) & 0x7) * 8) & 0xff) << 24);
-  return ret;
-}
-
-/// Find position of first least significant set bit in an integer.
-/// ffs(0) returns 0.
-///
-/// \param [in] a Input integer value
-/// \returns The position
-template <typename T> inline int ffs(T a) {
-  static_assert(std::is_integral<T>::value, "integer required");
-  return (sycl::ctz(a) + 1) % (sizeof(T) * 8 + 1);
-}
-
-/// select_from_sub_group allows work-items to obtain a copy of a value held by
-/// any other work-item in the sub_group. The input sub_group will be divided
-/// into several logical sub_groups with id range [0, \p logical_sub_group_size
-/// - 1]. Each work-item in logical sub_group gets value from another work-item
-/// whose id is \p remote_local_id. If \p remote_local_id is outside the
-/// logical sub_group id range, \p remote_local_id will modulo with \p
-/// logical_sub_group_size. The \p logical_sub_group_size must be a power of 2
-/// and not exceed input sub_group size.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] remote_local_id Input source work item id
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T select_from_sub_group(sycl::sub_group g, T x, int remote_local_id,
-                        int logical_sub_group_size = 32) {
-  unsigned int start_index =
-      g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
-  return sycl::select_from_group(
-      g, x, start_index + remote_local_id % logical_sub_group_size);
-}
-
-/// shift_sub_group_left move values held by the work-items in a sub_group
-/// directly to another work-item in the sub_group, by shifting values a fixed
-/// number of work-items to the left. The input sub_group will be divided into
-/// several logical sub_groups with id range [0, \p logical_sub_group_size - 1].
-/// Each work-item in logical sub_group gets value from another work-item whose
-/// id is caller's id adds \p delta. If calculated id is outside the logical
-/// sub_group id range, the work-item will get value from itself. The \p
-/// logical_sub_group_size must be a power of 2 and not exceed input sub_group
-/// size.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] delta Input delta
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T shift_sub_group_left(sycl::sub_group g, T x, unsigned int delta,
-                       int logical_sub_group_size = 32) {
-  unsigned int id = g.get_local_linear_id();
-  unsigned int end_index =
-      (id / logical_sub_group_size + 1) * logical_sub_group_size;
-  T result = sycl::shift_group_left(g, x, delta);
-  if ((id + delta) >= end_index) {
-    result = x;
-  }
-  return result;
-}
-
-/// shift_sub_group_right move values held by the work-items in a sub_group
-/// directly to another work-item in the sub_group, by shifting values a fixed
-/// number of work-items to the right. The input sub_group will be divided into
-/// several logical_sub_groups with id range [0, \p logical_sub_group_size - 1].
-/// Each work-item in logical_sub_group gets value from another work-item whose
-/// id is caller's id subtracts \p delta. If calculated id is outside the
-/// logical sub_group id range, the work-item will get value from itself. The \p
-/// logical_sub_group_size must be a power of 2 and not exceed input sub_group
-/// size.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] delta Input delta
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T shift_sub_group_right(sycl::sub_group g, T x, unsigned int delta,
-                        int logical_sub_group_size = 32) {
-  unsigned int id = g.get_local_linear_id();
-  unsigned int start_index =
-      id / logical_sub_group_size * logical_sub_group_size;
-  T result = sycl::shift_group_right(g, x, delta);
-  if ((id - start_index) < delta) {
-    result = x;
-  }
-  return result;
-}
-
-/// permute_sub_group_by_xor permutes values by exchanging values held by pairs
-/// of work-items identified by computing the bitwise exclusive OR of the
-/// work-item id and some fixed mask. The input sub_group will be divided into
-/// several logical sub_groups with id range [0, \p logical_sub_group_size - 1].
-/// Each work-item in logical sub_group gets value from another work-item whose
-/// id is bitwise exclusive OR of the caller's id and \p mask. If calculated id
-/// is outside the logical sub_group id range, the work-item will get value from
-/// itself. The \p logical_sub_group_size must be a power of 2 and not exceed
-/// input sub_group size.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] mask Input mask
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
-                           int logical_sub_group_size = 32) {
-  unsigned int id = g.get_local_linear_id();
-  unsigned int start_index =
-      id / logical_sub_group_size * logical_sub_group_size;
-  unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
-  return sycl::select_from_group(g, x,
-                                 target_offset < logical_sub_group_size
-                                     ? start_index + target_offset
-                                     : id);
-}
-
-/// The function match_any_over_sub_group conducts a comparison of values
-/// across work-items within a sub-group. match_any_over_sub_group return a mask
-/// in which some bits are set to 1, indicating that the \p value provided by
-/// the work-item represented by these bits are equal. The n-th bit of mask
-/// representing the work-item with id n. The parameter \p member_mask
-/// indicating the work-items participating the call.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] member_mask Input mask
-/// \param [in] value Input value
-/// \returns The result
-template <typename T>
-unsigned int match_any_over_sub_group(sycl::sub_group g, unsigned member_mask,
-                                      T value) {
-  static_assert(std::is_arithmetic_v<T>, "Value type must be arithmetic type.");                    
-  if (!member_mask) {
-    return 0;
-  }
-  unsigned int id = g.get_local_linear_id();
-  unsigned int flag = 0, result = 0, reduce_result = 0;
-  unsigned int bit_index = 0x1 << id;
-  bool is_participate = member_mask & bit_index;
-  T broadcast_value = 0;
-  bool matched = false;
-  while (flag != member_mask) {
-    broadcast_value =
-        sycl::select_from_group(g, value, sycl::ctz((~flag & member_mask)));
-    reduce_result = sycl::reduce_over_group(
-        g, is_participate ? (broadcast_value == value ? bit_index : 0) : 0,
-        sycl::plus<>());
-    flag |= reduce_result;
-    matched = reduce_result & bit_index;
-    result = matched * reduce_result + (1 - matched) * result;
-  }
-  return result;
-}
-
-/// The function match_all_over_sub_group conducts a comparison of values
-/// across work-items within a sub-group. match_all_over_sub_group return \p
-/// member_mask and predicate \p pred will be set to 1 if all \p value that
-/// provided by each work-item in \p member_mask are equal, otherwise return 0
-/// and the predicate \p pred will be set to 0. The n-th bit of \p member_mask
-/// representing the work-item with id n. The parameter \p member_mask
-/// indicating the work-items participating the call.
-/// \tparam T Input value type
-/// \param [in] g Input sub_group
-/// \param [in] member_mask Input mask
-/// \param [in] value Input value
-/// \param [out] pred Output predicate
-/// \returns The result
-template <typename T>
-unsigned int match_all_over_sub_group(sycl::sub_group g, unsigned member_mask,
-                                      T value, int *pred) {
-  static_assert(std::is_arithmetic_v<T>, "Value type must be arithmetic type."); 
-  if (!member_mask) {
-    return 0;
-  }
-  unsigned int id = g.get_local_linear_id();
-  unsigned int bit_index = 0x1 << id;
-  bool is_participate = member_mask & bit_index;
-  T broadcast_value = sycl::select_from_group(g, value, sycl::ctz(member_mask));
-  unsigned int reduce_result = sycl::reduce_over_group(
-      g,
-      (member_mask & bit_index) ? (broadcast_value == value ? bit_index : 0)
-                                : 0,
-      sycl::plus<>());
-  bool all_equal = (reduce_result == member_mask);
-  *pred = is_participate & all_equal;
-  return all_equal * member_mask;
-}
-
-namespace experimental {
-#if defined(__NVPTX__) && defined(__SYCL_DEVICE_ONLY__)
-#define SHFL_SYNC(RES, MASK, VAL, SHFL_PARAM, C, SHUFFLE_INSTR)                \
-  if constexpr (std::is_same_v<T, double>) {                                   \
-    int x_a, x_b;                                                              \
-    asm("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "d"(VAL));              \
-    auto tmp_a = __nvvm_shfl_sync_##SHUFFLE_INSTR(MASK, x_a, SHFL_PARAM, C);   \
-    auto tmp_b = __nvvm_shfl_sync_##SHUFFLE_INSTR(MASK, x_b, SHFL_PARAM, C);   \
-    asm("mov.b64 %0,{%1,%2};" : "=d"(RES) : "r"(tmp_a), "r"(tmp_b));           \
-  } else if constexpr (std::is_same_v<T, long> ||                              \
-                       std::is_same_v<T, unsigned long>) {                     \
-    int x_a, x_b;                                                              \
-    asm("mov.b64 {%0,%1},%2;" : "=r"(x_a), "=r"(x_b) : "l"(VAL));              \
-    auto tmp_a = __nvvm_shfl_sync_##SHUFFLE_INSTR(MASK, x_a, SHFL_PARAM, C);   \
-    auto tmp_b = __nvvm_shfl_sync_##SHUFFLE_INSTR(MASK, x_b, SHFL_PARAM, C);   \
-    asm("mov.b64 %0,{%1,%2};" : "=l"(RES) : "r"(tmp_a), "r"(tmp_b));           \
-  } else if constexpr (std::is_same_v<T, sycl::half>) {                        \
-    short tmp_b16;                                                             \
-    asm("mov.b16 %0,%1;" : "=h"(tmp_b16) : "h"(VAL));                          \
-    auto tmp_b32 = __nvvm_shfl_sync_##SHUFFLE_INSTR(                           \
-        MASK, static_cast<int>(tmp_b16), SHFL_PARAM, C);                       \
-    asm("mov.b16 %0,%1;" : "=h"(RES) : "h"(static_cast<short>(tmp_b32)));      \
-  } else if constexpr (std::is_same_v<T, float>) {                             \
-    auto tmp_b32 = __nvvm_shfl_sync_##SHUFFLE_INSTR(                           \
-        MASK, __nvvm_bitcast_f2i(VAL), SHFL_PARAM, C);                         \
-    RES = __nvvm_bitcast_i2f(tmp_b32);                                         \
-  } else {                                                                     \
-    RES = __nvvm_shfl_sync_##SHUFFLE_INSTR(MASK, VAL, SHFL_PARAM, C);          \
-  }
-#endif
-/// Masked version of select_from_sub_group, which execute masked sub-group
-/// operation. The parameter member_mask indicating the work-items participating
-/// the call. Whether the n-th bit is set to 1 representing whether the
-/// work-item with id n is participating the call. All work-items named in
-/// member_mask must be executed with the same member_mask, or the result is
-/// undefined.
-/// \tparam T Input value type
-/// \param [in] member_mask Input mask
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] remote_local_id Input source work item id
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T select_from_sub_group(unsigned int member_mask,
-                        sycl::sub_group g, T x, int remote_local_id,
-                        int logical_sub_group_size = 32) {
-#if defined(__SYCL_DEVICE_ONLY__)
-#if defined(__SPIR__)
-  unsigned int start_index =
-      g.get_local_linear_id() / logical_sub_group_size * logical_sub_group_size;
-  unsigned logical_remote_id =
-      start_index + remote_local_id % logical_sub_group_size;
-  return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x, logical_remote_id);
-#elif defined(__NVPTX__)
-  T result;
-  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
-  SHFL_SYNC(result, member_mask, x, remote_local_id, cVal, idx_i32)
-  return result;
-#endif
-#else
-  (void)g;
-  (void)x;
-  (void)remote_local_id;
-  (void)logical_sub_group_size;
-  (void)member_mask;
-  throw sycl::exception(sycl::errc::runtime, "Masked version of select_from_sub_group not "
-                        "supported on host device.");
-#endif // __SYCL_DEVICE_ONLY__
-}
-
-/// Masked version of shift_sub_group_left, which execute masked sub-group
-/// operation. The parameter member_mask indicating the work-items participating
-/// the call. Whether the n-th bit is set to 1 representing whether the
-/// work-item with id n is participating the call. All work-items named in
-/// member_mask must be executed with the same member_mask, or the result is
-/// undefined.
-/// \tparam T Input value type
-/// \param [in] member_mask Input mask
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] delta Input delta
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T shift_sub_group_left(unsigned int member_mask,
-                       sycl::sub_group g, T x, unsigned int delta,
-                       int logical_sub_group_size = 32) {
-#if defined(__SYCL_DEVICE_ONLY__)
-#if defined(__SPIR__)
-  unsigned int id = g.get_local_linear_id();
-  unsigned int end_index =
-      (id / logical_sub_group_size + 1) * logical_sub_group_size;
-  T result = __spirv_GroupNonUniformShuffleDown(__spv::Scope::Subgroup, x, delta);
-  if ((id + delta) >= end_index) {
-    result = x;
-  }
-  return result;
-#elif defined(__NVPTX__)
-  T result;
-  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
-  SHFL_SYNC(result, member_mask, x, delta, cVal, down_i32)
-  return result;
-#endif
-#else
-  (void)g;
-  (void)x;
-  (void)delta;
-  (void)logical_sub_group_size;
-  (void)member_mask;
-  throw sycl::exception(sycl::errc::runtime, "Masked version of select_from_sub_group not "
-                        "supported on host device.");
-#endif // __SYCL_DEVICE_ONLY__
-}
-
-/// Masked version of shift_sub_group_right, which execute masked sub-group
-/// operation. The parameter member_mask indicating the work-items participating
-/// the call. Whether the n-th bit is set to 1 representing whether the
-/// work-item with id n is participating the call. All work-items named in
-/// member_mask must be executed with the same member_mask, or the result is
-/// undefined.
-/// \tparam T Input value type
-/// \param [in] member_mask Input mask
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] delta Input delta
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T shift_sub_group_right(unsigned int member_mask,
-                        sycl::sub_group g, T x, unsigned int delta,
-                        int logical_sub_group_size = 32) {
-#if defined(__SYCL_DEVICE_ONLY__)
-#if defined(__SPIR__)
-  unsigned int id = g.get_local_linear_id();
-  unsigned int start_index =
-      id / logical_sub_group_size * logical_sub_group_size;
-  T result = __spirv_GroupNonUniformShuffleUp(__spv::Scope::Subgroup, x, delta);
-  if ((id - start_index) < delta) {
-    result = x;
-  }
-  return result;
-#elif defined(__NVPTX__)
-  T result;
-  int cVal = ((32 - logical_sub_group_size) << 8);
-  SHFL_SYNC(result, member_mask, x, delta, cVal, up_i32)
-  return result;
-#endif
-#else
-  (void)g;
-  (void)x;
-  (void)delta;
-  (void)logical_sub_group_size;
-  (void)member_mask;
-  throw sycl::exception(sycl::errc::runtime, "Masked version of select_from_sub_group not "
-                        "supported on host device.");
-#endif // __SYCL_DEVICE_ONLY__
-}
-
-/// Masked version of permute_sub_group_by_xor, which execute masked sub-group
-/// operation. The parameter member_mask indicating the work-items participating
-/// the call. Whether the n-th bit is set to 1 representing whether the
-/// work-item with id n is participating the call. All work-items named in
-/// member_mask must be executed with the same member_mask, or the result is
-/// undefined.
-/// \tparam T Input value type
-/// \param [in] member_mask Input mask
-/// \param [in] g Input sub_group
-/// \param [in] x Input value
-/// \param [in] mask Input mask
-/// \param [in] logical_sub_group_size Input logical sub_group size
-/// \returns The result
-template <typename T>
-T permute_sub_group_by_xor(unsigned int member_mask,
-                           sycl::sub_group g, T x, unsigned int mask,
-                           int logical_sub_group_size = 32) {
-#if defined(__SYCL_DEVICE_ONLY__)
-#if defined(__SPIR__)
-  unsigned int id = g.get_local_linear_id();
-  unsigned int start_index =
-      id / logical_sub_group_size * logical_sub_group_size;
-  unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
-  unsigned logical_remote_id = (target_offset < logical_sub_group_size) ? start_index + target_offset : id;
-  return __spirv_GroupNonUniformShuffle(__spv::Scope::Subgroup, x, logical_remote_id);
-#elif defined(__NVPTX__)
-  T result;
-  int cVal = ((32 - logical_sub_group_size) << 8) | 31;
-  SHFL_SYNC(result, member_mask, x, mask, cVal, bfly_i32)
-  return result;
-#endif
-#else
-  (void)g;
-  (void)x;
-  (void)mask;
-  (void)logical_sub_group_size;
-  (void)member_mask;
-  throw sycl::exception(sycl::errc::runtime, "Masked version of select_from_sub_group not "
-                        "supported on host device.");
-#endif // __SYCL_DEVICE_ONLY__
-}
-#if defined(__NVPTX__)
-#undef SHFL_SYNC
-#endif
-} // namespace experimental
-
-/// Computes the multiplication of two complex numbers.
-/// \tparam T Complex element type
-/// \param [in] x The first input complex number
-/// \param [in] y The second input complex number
-/// \returns The result
-template <typename T>
-sycl::vec<T, 2> cmul(sycl::vec<T, 2> x, sycl::vec<T, 2> y) {
-  std::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
-  t1 = t1 * t2;
-  return sycl::vec<T, 2>(t1.real(), t1.imag());
-}
-
-/// Computes the division of two complex numbers.
-/// \tparam T Complex element type
-/// \param [in] x The first input complex number
-/// \param [in] y The second input complex number
-/// \returns The result
-template <typename T>
-sycl::vec<T, 2> cdiv(sycl::vec<T, 2> x, sycl::vec<T, 2> y) {
-  std::complex<T> t1(x[0], x[1]), t2(y[0], y[1]);
-  t1 = t1 / t2;
-  return sycl::vec<T, 2>(t1.real(), t1.imag());
-}
-
-/// Computes the magnitude of a complex number.
-/// \tparam T Complex element type
-/// \param [in] x The input complex number
-/// \returns The result
-template <typename T>
-T cabs(sycl::vec<T, 2> x) {
-  std::complex<T> t(x[0], x[1]);
-  return std::abs(t);
-}
-
-/// Computes the complex conjugate of a complex number.
-/// \tparam T Complex element type
-/// \param [in] x The input complex number
-/// \returns The result
-template <typename T>
-sycl::vec<T, 2> conj(sycl::vec<T, 2> x) {
-  std::complex<T> t(x[0], x[1]);
-  t = std::conj(t);
-  return sycl::vec<T, 2>(t.real(), t.imag());
-}
-
-inline int get_sycl_language_version() {
-#ifdef SYCL_LANGUAGE_VERSION
-  return SYCL_LANGUAGE_VERSION;
-#else
-  return 202000;
-#endif
-}
-
-namespace experimental {
-/// Synchronize work items from all work groups within a SYCL kernel.
-/// \param [in] item:  Represents a work group.
-/// \param [in] counter: An atomic object defined on a device memory which can
-/// be accessed by work items in all work groups. The initial value of the
-/// counter should be zero.
-/// Note: Please make sure that all the work items of all work groups within
-/// a SYCL kernel can be scheduled actively at the same time on a device.
-template <int dimensions = 3>
-inline void
-nd_range_barrier(const sycl::nd_item<dimensions> &item,
-                 sycl::atomic_ref<
-                     unsigned int, sycl::memory_order::seq_cst,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space> &counter) {
-
-  static_assert(dimensions == 3, "dimensions must be 3.");
-
-  unsigned int num_groups = item.get_group_range(2) * item.get_group_range(1) *
-                            item.get_group_range(0);
-
-  item.barrier();
-
-  if (item.get_local_linear_id() == 0) {
-    unsigned int inc = 1;
-    unsigned int old_arrive = 0;
-    bool is_group0 =
-        (item.get_group(2) + item.get_group(1) + item.get_group(0) == 0);
-    if (is_group0) {
-      inc = 0x80000000 - (num_groups - 1);
-    }
-
-    old_arrive = counter.fetch_add(inc);
-    // Synchronize all the work groups
-    while (((old_arrive ^ counter.load()) & 0x80000000) == 0)
-      ;
-  }
-
-  item.barrier();
-}
-
-/// Synchronize work items from all work groups within a SYCL kernel.
-/// \param [in] item:  Represents a work group.
-/// \param [in] counter: An atomic object defined on a device memory which can
-/// be accessed by work items in all work groups. The initial value of the
-/// counter should be zero.
-/// Note: Please make sure that all the work items of all work groups within
-/// a SYCL kernel can be scheduled actively at the same time on a device.
-template <>
-inline void
-nd_range_barrier(const sycl::nd_item<1> &item,
-                 sycl::atomic_ref<
-                     unsigned int, sycl::memory_order::seq_cst,
-                     sycl::memory_scope::device,
-                     sycl::access::address_space::global_space> &counter) {
-  unsigned int num_groups = item.get_group_range(0);
-
-  item.barrier();
-
-  if (item.get_local_linear_id() == 0) {
-    unsigned int inc = 1;
-    unsigned int old_arrive = 0;
-    bool is_group0 = (item.get_group(0) == 0);
-    if (is_group0) {
-      inc = 0x80000000 - (num_groups - 1);
-    }
-
-    old_arrive = counter.fetch_add(inc);
-    // Synchronize all the work groups
-    while (((old_arrive ^ counter.load()) & 0x80000000) == 0)
-      ;
-  }
-
-  item.barrier();
-}
-
-/// The logical-group is a logical collection of some work-items within a
-/// work-group.
-/// Note: Please make sure that the logical-group size is a power of 2 in the
-/// range [1, current_sub_group_size].
-template <int dimensions = 3> class logical_group {
-  sycl::nd_item<dimensions> _item;
-  sycl::group<dimensions> _g;
-  uint32_t _logical_group_size;
-  uint32_t _group_linear_range_in_parent;
-
-public:
-  /// Dividing \p parent_group into several logical-groups.
-  /// \param [in] item Current work-item.
-  /// \param [in] parent_group The group to be divided.
-  /// \param [in] size The logical-group size.
-  logical_group(sycl::nd_item<dimensions> item,
-                sycl::group<dimensions> parent_group, uint32_t size)
-      : _item(item), _g(parent_group), _logical_group_size(size) {
-    _group_linear_range_in_parent =
-        (_g.get_local_linear_range() - 1) / _logical_group_size + 1;
-  }
-  logical_group(sycl::nd_item<dimensions> item)
-      : _item(item), _g(item.get_group()) {}
-  /// Returns the index of the work-item within the logical-group.
-  uint32_t get_local_linear_id() const {
-    return _item.get_local_linear_id() % _logical_group_size;
-  }
-  /// Returns the index of the logical-group in the parent group.
-  uint32_t get_group_linear_id() const {
-    return _item.get_local_linear_id() / _logical_group_size;
-  }
-  /// Returns the number of work-items in the logical-group.
-  uint32_t get_local_linear_range() const {
-    if (_g.get_local_linear_range() % _logical_group_size == 0) {
-      return _logical_group_size;
-    }
-    uint32_t last_item_group_id =
-        _g.get_local_linear_range() / _logical_group_size;
-    uint32_t first_of_last_group = last_item_group_id * _logical_group_size;
-    if (_item.get_local_linear_id() >= first_of_last_group) {
-      return _g.get_local_linear_range() - first_of_last_group;
-    } else {
-      return _logical_group_size;
-    }
-  }
-  /// Returns the number of logical-group in the parent group.
-  uint32_t get_group_linear_range() const {
-    return _group_linear_range_in_parent;
-  }
-};
-
-// The original source of the functions calculate_max_active_wg_per_xecore and
-// calculate_max_potential_wg were under the license below:
-//
-// Copyright (C) Intel Corporation
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-//
-/// This function is used for occupancy calculation, it computes the max active
-/// work-group number per Xe-Core. Ref to
-/// https://github.com/oneapi-src/oneAPI-samples/tree/master/Tools/GPU-Occupancy-Calculator
-/// \param [out] num_wg Active work-group number.
-/// \param [in] wg_size Work-group size.
-/// \param [in] slm_size Share local memory size.
-/// \param [in] sg_size Sub-group size.
-/// \param [in] used_barrier Whether barrier is used.
-/// \param [in] used_large_grf Whether large General Register File is used.
-/// \return If no error, returns 0.
-/// If \p wg_size exceeds the max work-group size, the max work-group size will
-/// be used instead of \p wg_size and returns -1.
-inline int calculate_max_active_wg_per_xecore(int *num_wg, int wg_size,
-                                              int slm_size = 0,
-                                              int sg_size = 32,
-                                              bool used_barrier = false,
-                                              bool used_large_grf = false) {
-  int ret = 0;
-  const int slm_size_per_xe_core = 64 * 1024;
-  const int max_barrier_registers = 32;
-  dpct::device_ext &dev = dpct::get_current_device();
-
-  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
-  if (wg_size > max_wg_size) {
-    wg_size = max_wg_size;
-    ret = -1;
-  }
-
-  int num_threads_ss = 56;
-  int max_num_wg = 56;
-  if (dev.has(sycl::aspect::ext_intel_gpu_eu_count_per_subslice) &&
-      dev.has(sycl::aspect::ext_intel_gpu_hw_threads_per_eu)) {
-    auto eu_count =
-        dev.get_info<sycl::info::device::ext_intel_gpu_eu_count_per_subslice>();
-    auto threads_count =
-        dev.get_info<sycl::ext::intel::info::device::gpu_hw_threads_per_eu>();
-    num_threads_ss = eu_count * threads_count;
-    max_num_wg = eu_count * threads_count;
-  }
-
-  if (used_barrier) {
-    max_num_wg = max_barrier_registers;
-  }
-
-  // Calculate num_wg_slm
-  int num_wg_slm = 0;
-  if (slm_size == 0) {
-    num_wg_slm = max_num_wg;
-  } else {
-    num_wg_slm = std::floor((float)slm_size_per_xe_core / slm_size);
-  }
-
-  // Calculate num_wg_threads
-  if (used_large_grf)
-    num_threads_ss = num_threads_ss / 2;
-  int num_threads = std::ceil((float)wg_size / sg_size);
-  int num_wg_threads = std::floor((float)num_threads_ss / num_threads);
-
-  // Calculate num_wg
-  *num_wg = std::min(num_wg_slm, num_wg_threads);
-  *num_wg = std::min(*num_wg, max_num_wg);
-  return ret;
-}
-
-/// This function is used for occupancy calculation, it computes the work-group
-/// number and the work-group size which achieves the maximum occupancy of the
-/// device potentially. Ref to
-/// https://github.com/oneapi-src/oneAPI-samples/tree/master/Tools/GPU-Occupancy-Calculator
-/// \param [out] num_wg Work-group number.
-/// \param [out] wg_size Work-group size.
-/// \param [in] max_ws_size_for_device_code The maximum working work-group size
-/// for current device code logic. Zero means no limitation.
-/// \param [in] slm_size Share local memory size.
-/// \param [in] sg_size Sub-group size.
-/// \param [in] used_barrier Whether barrier is used.
-/// \param [in] used_large_grf Whether large General Register File is used.
-/// \return Returns 0.
-inline int calculate_max_potential_wg(int *num_wg, int *wg_size,
-                                      int max_ws_size_for_device_code,
-                                      int slm_size = 0, int sg_size = 32,
-                                      bool used_barrier = false,
-                                      bool used_large_grf = false) {
-  sycl::device &dev = dpct::get_current_device();
-  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
-  if (max_ws_size_for_device_code == 0 ||
-      max_ws_size_for_device_code >= max_wg_size)
-    *wg_size = (int)max_wg_size;
-  else
-    *wg_size = max_ws_size_for_device_code;
-  calculate_max_active_wg_per_xecore(num_wg, *wg_size, slm_size, sg_size,
-                                     used_barrier, used_large_grf);
-  std::uint32_t num_ss = 1;
-  if (dev.has(sycl::aspect::ext_intel_gpu_slices) &&
-      dev.has(sycl::aspect::ext_intel_gpu_subslices_per_slice)) {
-    num_ss =
-        dev.get_info<sycl::ext::intel::info::device::gpu_slices>() *
-        dev.get_info<sycl::ext::intel::info::device::gpu_subslices_per_slice>();
-  }
-  num_wg[0] = num_ss * num_wg[0];
-  return 0;
-}
-
-/// Supported group type during migration.
-enum class group_type { work_group, sub_group, logical_group, root_group };
-
-/// The group_base will dispatch the function call to the specific interface
-/// based on the group type.
-template <int dimensions = 3> class group_base {
-public:
-  group_base(sycl::nd_item<dimensions> item)
-      : nd_item(item), logical_group(item) {}
-  ~group_base() {}
-  /// Returns the number of work-items in the group.
-  size_t get_local_linear_range() {
-    switch (type) {
-    case group_type::work_group:
-      return nd_item.get_group().get_local_linear_range();
-    case group_type::sub_group:
-      return nd_item.get_sub_group().get_local_linear_range();
-    case group_type::logical_group:
-      return logical_group.get_local_linear_range();
-    default:
-      return -1; // Unkonwn group type
-    }
-  }
-  /// Returns the index of the work-item within the group.
-  size_t get_local_linear_id() {
-    switch (type) {
-    case group_type::work_group:
-      return nd_item.get_group().get_local_linear_id();
-    case group_type::sub_group:
-      return nd_item.get_sub_group().get_local_linear_id();
-    case group_type::logical_group:
-      return logical_group.get_local_linear_id();
-    default:
-      return -1; // Unkonwn group type
-    }
-  }
-  /// Wait for all the elements within the group to complete their execution
-  /// before proceeding.
-  void barrier() {
-    switch (type) {
-    case group_type::work_group:
-      sycl::group_barrier(nd_item.get_group());
-      break;
-    case group_type::sub_group:
-    case group_type::logical_group:
-      sycl::group_barrier(nd_item.get_sub_group());
-      break;
-    default:
-      break;
-    }
-  }
-
-protected:
-  logical_group<dimensions> logical_group;
-  sycl::nd_item<dimensions> nd_item;
-  group_type type;
-};
-
-/// The group class is a container type that can storage supported group_type.
-template <typename T, int dimensions = 3>
-class group : public group_base<dimensions> {
-  using group_base<dimensions>::type;
-  using group_base<dimensions>::logical_group;
-
-public:
-  group(T g, sycl::nd_item<dimensions> item) : group_base<dimensions>(item) {
-    if constexpr (std::is_same_v<T, sycl::sub_group>) {
-      type = group_type::sub_group;
-    } else if constexpr (std::is_same_v<T, sycl::group<dimensions>>) {
-      type = group_type::work_group;
-    } else if constexpr (std::is_same_v<T, dpct::experimental::logical_group<
-                                               dimensions>>) {
-      logical_group = g;
-      type = group_type::logical_group;
-    }
-  }
-};
-} // namespace experimental
-
-/// If x <= 2, then return a pointer to the deafult queue;
-/// otherwise, return x reinterpreted as a dpct::queue_ptr.
-inline queue_ptr int_as_queue_ptr(uintptr_t x) {
-  return x <= 2 ?
-  &get_default_queue()
-  : reinterpret_cast<queue_ptr>(x);
-}
-
-template <int n_nondefault_params, int n_default_params, typename T>
-class args_selector;
-
-/// args_selector is a helper class for extracting arguments from an
-/// array of pointers to arguments or buffer of arguments to pass to a
-/// kernel function.
-///
-/// \param R(Ts...) The type of the kernel
-/// \param n_nondefault_params The number of nondefault parameters of the kernel
-/// (excluding parameters that like sycl::nd_item, etc.)
-/// \param n_default_params The number of default parameters of the kernel
-///
-/// Example usage:
-/// With the following kernel:
-///   void foo(sycl::float2 *x, int n, sycl::nd_item<3> item_ct1, float f=.1) {}
-/// and with the declaration:
-///   args_selector<2, 1, decltype(foo)> selector(kernelParams, extra);
-/// we have:
-///   selector.get<0>() returns a reference to sycl::float*,
-///   selector.get<1>() returns a reference to int,
-///   selector.get<2>() returns a reference to float
-template <int n_nondefault_params, int n_default_params,
-   typename R, typename... Ts>
-class args_selector<n_nondefault_params, n_default_params, R(Ts...)> {
-private:
-  void **kernel_params;
-  char *args_buffer;
-
-  template <int i>
-  static constexpr int account_for_default_params() {
-    constexpr int n_total_params = sizeof...(Ts);
-    if constexpr (i >= n_nondefault_params) {
-      return n_total_params - n_default_params + (i - n_nondefault_params);
-    } else {
-      return i;
-    }
-  }    
-
-public:
-  /// Get the type of the ith argument of R(Ts...)
-  /// \param [in] i Index of parameter to get
-  /// \returns Type of ith parameter
-  template <int i>
-  using arg_type = std::tuple_element_t<account_for_default_params<i>(),
-					  std::tuple<Ts...>>;
-private:
-  template <int i>
-  static constexpr int get_offset() {
-    if constexpr (i == 0) {
-      // we can assume args_buffer is properly aligned to the
-      // first argument
-      return 0;
-    } else {
-      constexpr int prev_off = get_offset<i-1>();
-      constexpr int prev_past_end = prev_off + sizeof(arg_type<i-1>);
-      using T = arg_type<i>;
-      // is the past-the-end of the i-1st element properly aligned
-      // with the ith element's alignment?
-      if constexpr (prev_past_end % alignof(T) == 0) {
-	return prev_past_end;
-      }
-      // otherwise bump prev_past_end to match alignment
-      else {
-	return prev_past_end + (alignof(T) - (prev_past_end % alignof(T)));
-      }
-    }
-  }
-
-  static char *get_args_buffer(void **extra) {
-    if (!extra)
-      return nullptr;
-    for (; (std::size_t) *extra != 0; ++extra) {
-      if ((std::size_t) *extra == 1) {
-	return static_cast<char*>(*(extra+1));
-      }
-    }
-    return nullptr;
-  }
-    
-public:
-  /// If kernel_params is nonnull, then args_selector will
-  /// extract arguments from kernel_params. Otherwise, it
-  /// will extract them from extra.
-  /// \param [in] kernel_params Array of pointers to arguments
-  /// a or null pointer.
-  /// \param [in] extra Array containing pointer to argument buffer.
-  args_selector(void **kernel_params, void **extra)
-    : kernel_params(kernel_params),
-      args_buffer(get_args_buffer(extra))
-  {}
-
-  /// Get a reference to the ith argument extracted from kernel_params
-  /// or extra.
-  /// \param [in] i Index of argument to get
-  /// \returns Reference to the ith argument
-  template <int i>    
-  arg_type<i> &get() {
-    if (kernel_params) {
-      return *static_cast<arg_type<i>*>(kernel_params[i]);
-    } else {
-      return *reinterpret_cast<arg_type<i>*>(args_buffer + get_offset<i>());
-    }
-  }
-};
-
-#ifdef _WIN32
-#define DPCT_EXPORT __declspec(dllexport)
-#else
-#define DPCT_EXPORT
-#endif
-
-} // namespace dpct
-
-#endif // __DPCT_UTIL_HPP__
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 1931b80942e2a..e159532e0f325 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -19,9 +19,10 @@
 
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
-#include <dpct/dpct.hpp>
-#include <dpct/blas_utils.hpp>
-#include <dpct/lib_common_utils.hpp>
+// #include <dpct/dpct.hpp>
+// #include <dpct/blas_utils.hpp>
+// #include <dpct/lib_common_utils.hpp>
+#include "dpct.hpp"
 #include "ggml-sycl.h"
 #include "ggml.h"
 #include "ggml-backend-impl.h"
diff --git a/run.sh b/run.sh
index 254a18efc2c65..6e08fac7d9a4b 100755
--- a/run.sh
+++ b/run.sh
@@ -14,6 +14,6 @@ echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
 #export GGML_SYCL_DEBUG=1
 #export GGML_SYCL_LIST_DEVICE=1
 #./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
+./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
 #./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
 

From c67c2ab228dc25ff0c73127232ea1c99a150e294 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Sat, 13 Jan 2024 21:14:46 +0800
Subject: [PATCH 20/90] refactor device log

---
 ggml-sycl.cpp | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index e159532e0f325..97398a209d1c6 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -8358,7 +8358,7 @@ bool ggml_sycl_loaded(void) {
 }
 void print_devices(){
     int device_count = dpct::dev_mgr::instance().device_count();
-    fprintf(stderr, "\n%s: found %d SYCL devices:\n", __func__, device_count);
+    fprintf(stderr, "found %d SYCL devices:\n", device_count);
     for (int id = 0; id < device_count; ++id) {
         dpct::device_info prop;
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
@@ -8373,7 +8373,7 @@ void print_devices(){
                 prop.get_global_mem_size()
                 );
     }
-    fprintf(stderr, "\n");
+    // fprintf(stderr, "\n");
 }
 
 int get_sycl_env(const char* env_name, int default_val){
@@ -8414,8 +8414,6 @@ void ggml_init_sycl() try {
 
         int user_device_id = get_sycl_env("GGML_SYCL_DEVICE", 0);
 
-        print_devices();
-
         if (CHECK_TRY_ERROR(g_all_sycl_device_count =
                                  dpct::dev_mgr::instance().device_count()) !=
             0) {
@@ -8423,7 +8421,6 @@ void ggml_init_sycl() try {
             g_sycl_loaded = false;
             return;
         }
-
         GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES);
         int64_t total_vram = 0;
 
@@ -8439,7 +8436,7 @@ void ggml_init_sycl() try {
 #else
         fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
-
+        print_devices();
         for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
             g_sycl_device_id2index[id].index = -1;
             g_device_caps[id].vmm = 0;
@@ -8448,7 +8445,6 @@ void ggml_init_sycl() try {
             g_tensor_split[id] = 0;
         }
 
-
         int device_inx = -1;
         for (int id = 0; id < g_all_sycl_device_count; ++id) {
             if(id!=user_device_id) continue;
@@ -8464,10 +8460,10 @@ void ggml_init_sycl() try {
             SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
                 prop, dpct::dev_mgr::instance().get_device(id))));
 
-            fprintf(stderr,
-                    "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id,
-                    prop.get_name(), prop.get_major_version(),
-                    prop.get_minor_version(), device_vmm ? "yes" : "no");
+            // fprintf(stderr,
+            //         "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id,
+            //         prop.get_name(), prop.get_major_version(),
+            //         prop.get_minor_version(), device_vmm ? "yes" : "no");
 
             g_tensor_split[device_inx] = total_vram;
             total_vram += prop.get_global_mem_size();
@@ -8475,7 +8471,7 @@ void ggml_init_sycl() try {
             g_device_caps[device_inx].cc =
                 100 * prop.get_major_version() + 10 * prop.get_minor_version();
 
-            printf("g_device_caps[%d].cc=%d\n", device_inx, g_device_caps[device_inx].cc);
+            // printf("g_device_caps[%d].cc=%d\n", device_inx, g_device_caps[device_inx].cc);
         }
         device_inx = -1;
         for (int id = 0; id < g_all_sycl_device_count; ++id) {
@@ -8520,7 +8516,7 @@ void ggml_init_sycl() try {
         ggml_sycl_set_main_device(user_device_id);
         ggml_sycl_set_device(user_device_id);
         g_work_group_size = get_work_group_size(user_device_id);
-        fprintf(stderr, "Using Device %d\n", user_device_id);
+        // fprintf(stderr, "Using Device %d\n", user_device_id);
 
         // for (int id = 0; id < g_all_sycl_device_count; ++id) {
         //     GGML_SYCL_DEBUG("id=%d  g_device_caps[%d].device_id=%d g_sycl_device_id2index[%d].index=%d ", id, id,
@@ -11396,7 +11392,7 @@ void ggml_sycl_set_main_device(const int main_device) try {
         dpct::device_info prop;
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info(
             prop, dpct::dev_mgr::instance().get_device(g_main_device))));
-        fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__,
+        fprintf(stderr, "Using device %d (%s) as main device\n",
                 g_main_device, prop.get_name());
     }
 }

From c3c5b20ac5dcc6dc4f2afdecd2a12fa2c940a28b Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Mon, 15 Jan 2024 10:01:32 +0800
Subject: [PATCH 21/90] mv dpct definition from folder dpct to ggml-sycl.h

---
 dpcpp_out2/MainSourceFiles.yaml | 18145 ------------------------------
 dpcpp_out2/ggml-alloc.h         |    92 -
 dpcpp_out2/ggml-backend-impl.h  |   116 -
 dpcpp_out2/ggml-backend.h       |   188 -
 dpcpp_out2/ggml-cuda.dp.cpp     | 12724 ---------------------
 dpcpp_out2/ggml-cuda.h          |    64 -
 dpcpp_out2/ggml.h               |  2253 ----
 dpcpp_out2/ggml.h.yaml          |   100 -
 dpct.hpp                        |  2831 -----
 ggml-sycl.cpp                   |     5 +-
 ggml-sycl.h                     |  2807 +++++
 11 files changed, 2808 insertions(+), 36517 deletions(-)
 delete mode 100644 dpcpp_out2/MainSourceFiles.yaml
 delete mode 100644 dpcpp_out2/ggml-alloc.h
 delete mode 100644 dpcpp_out2/ggml-backend-impl.h
 delete mode 100644 dpcpp_out2/ggml-backend.h
 delete mode 100644 dpcpp_out2/ggml-cuda.dp.cpp
 delete mode 100644 dpcpp_out2/ggml-cuda.h
 delete mode 100644 dpcpp_out2/ggml.h
 delete mode 100644 dpcpp_out2/ggml.h.yaml
 delete mode 100644 dpct.hpp

diff --git a/dpcpp_out2/MainSourceFiles.yaml b/dpcpp_out2/MainSourceFiles.yaml
deleted file mode 100644
index 472f76ce1182e..0000000000000
--- a/dpcpp_out2/MainSourceFiles.yaml
+++ /dev/null
@@ -1,18145 +0,0 @@
----
-MainSourceFile:  '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/MainSrcFiles_placehold'
-Replacements:
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          0
-    Length:          0
-    ReplacementText: "#define DPCT_PROFILING_ENABLED\n#define DPCT_COMPAT_RT_VERSION 12010\n#include <sycl/sycl.hpp>\n#include <dpct/dpct.hpp>\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211
-    Length:          0
-    ReplacementText: "\n#include <dpct/blas_utils.hpp>\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          4081
-    Length:          26
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          4107
-    Length:          18
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          4125
-    Length:          23
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          4148
-    Length:          23
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          4176
-    Length:          14
-    ReplacementText: DPCT_COMPAT_RT_VERSION
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          4517
-    Length:          0
-    ReplacementText: "\n#include <cmath>\n\n#include <dpct/lib_common_utils.hpp>\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          8376
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          8431
-    Length:          14
-    ReplacementText: DPCT_COMPAT_RT_VERSION
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          8506
-    Length:          14
-    ReplacementText: int
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          8528
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1009:48: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          8543
-    Length:          26
-    ReplacementText: '"cublasGetStatusString is not supported"/*cublasGetStatusString(err)*/'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          9824
-    Length:          0
-    ReplacementText: "/*\nDPCT1001:50: The statement could not be removed.\n*/\n/*\nDPCT1000:51: Error handling if-stmt was detected but could not be rewritten.\n*/\n/*\nDPCT1009:52: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code.\n*/\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          9886
-    Length:          11
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          9961
-    Length:          24
-    ReplacementText: '"cudaGetErrorString is not supported"/*cudaGetErrorString(err_)*/'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          10064
-    Length:          21
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          10249
-    Length:          8
-    ReplacementText: int
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          10291
-    Length:          0
-    ReplacementText: "    /*\n    DPCT1007:49: Migration of cuGetErrorString is not supported.\n    */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          10350
-    Length:          0
-    ReplacementText: "/*\nDPCT1001:67: The statement could not be removed.\n*/\n/*\nDPCT1000:68: Error handling if-stmt was detected but could not be rewritten.\n*/\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          10412
-    Length:          12
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          10540
-    Length:          14
-    ReplacementText: DPCT_COMPAT_RT_VERSION
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          10822
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          10869
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          10880
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          11159
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          11170
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          11451
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          11462
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          11646
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          11657
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          11942
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          12027
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          12774
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          12985
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          13253
-    Length:          0
-    ReplacementText: ' dpct_type_471834'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          13260
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          13542
-    Length:          0
-    ReplacementText: ' dpct_type_143705'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          13549
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          13854
-    Length:          0
-    ReplacementText: ' dpct_type_673649'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          13861
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          14212
-    Length:          0
-    ReplacementText: ' dpct_type_135589'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          14219
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          14593
-    Length:          0
-    ReplacementText: ' dpct_type_122878'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          14600
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          14868
-    Length:          0
-    ReplacementText: ' dpct_type_143721'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          14875
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          15254
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          15390
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          15640
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          15770
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          16066
-    Length:          0
-    ReplacementText: ' dpct_type_619598'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          16183
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          16445
-    Length:          0
-    ReplacementText: ' dpct_type_138576'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          16713
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          17280
-    Length:          0
-    ReplacementText: ' dpct_type_154943'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          17287
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          18042
-    Length:          0
-    ReplacementText: ' dpct_type_866817'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          18049
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          18523
-    Length:          0
-    ReplacementText: ' dpct_type_107281'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          18669
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          20185
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          20254
-    Length:          7
-    ReplacementText: '&dpct::get_in_order_queue()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          20397
-    Length:          11
-    ReplacementText: 'dpct::event_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          20636
-    Length:          11
-    ReplacementText: 'dpct::err0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          20687
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          20728
-    Length:          30
-    ReplacementText: 'DPCT_CHECK_ERROR(current_device = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          20813
-    Length:          11
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          20833
-    Length:          0
-    ReplacementText: "    /*\n    DPCT1093:53: The \"device\" device may be not the one intended for use. Adjust the selected device if needed.\n    */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          20844
-    Length:          13
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::select_device'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          20865
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          20868
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21441
-    Length:          14
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21530
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21555
-    Length:          0
-    ReplacementText: 'const sycl::stream &stream_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21563
-    Length:          91
-    ReplacementText: 'stream_ct1 << "ERROR: ggml-cuda was compiled without support for the current GPU architecture.\n"'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21738
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21749
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21794
-    Length:          0
-    ReplacementText: ",\n                                             const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21861
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1096:98: The right-most dimension of the work-group used in the SYCL kernel that calls this function may be less than \"32\". The function \"dpct::permute_sub_group_by_xor\" may return an unexpected result on the CPU device. Modify the size of the work-group to ensure that the value of the right-most dimension is a multiple of \"32\".\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21874
-    Length:          40
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21946
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21957
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21973
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          21996
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22004
-    Length:          0
-    ReplacementText: ",\n                                              const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22079
-    Length:          3
-    ReplacementText: 'a.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22086
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(), mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22138
-    Length:          3
-    ReplacementText: 'a.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22145
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(), mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22219
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22230
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22275
-    Length:          0
-    ReplacementText: ",\n                                             const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22342
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1096:97: The right-most dimension of the work-group used in the SYCL kernel that calls this function may be less than \"32\". The function \"dpct::permute_sub_group_by_xor\" may return an unexpected result on the CPU device. Modify the size of the work-group to ensure that the value of the right-most dimension is a multiple of \"32\".\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22354
-    Length:          50
-    ReplacementText: 'sycl::fmax(x, dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22436
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22447
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22535
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22546
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22635
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22646
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22735
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22746
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          22937
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23205
-    Length:          0
-    ReplacementText: ",\n        const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23229
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23240
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23253
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23286
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23297
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23310
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23344
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23355
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23368
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23408
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23419
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23432
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23935
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          23946
-    Length:          9
-    ReplacementText: 'item_ct1.get_group_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          24206
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          24482
-    Length:          0
-    ReplacementText: ",\n        const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          24505
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          24516
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          24529
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          25255
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          25436
-    Length:          0
-    ReplacementText: ', const sycl::nd_item<3> &item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          25458
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          25471
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          25484
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          25849
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          25915
-    Length:          0
-    ReplacementText: ",\n                     const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26051
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26062
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26075
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26180
-    Length:          51
-    ReplacementText: 'sycl::tanh(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26244
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26310
-    Length:          0
-    ReplacementText: ",\n                     const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26332
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26343
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26356
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26438
-    Length:          11
-    ReplacementText: 'sycl::native::exp(-x[i])'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26462
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26526
-    Length:          0
-    ReplacementText: ",\n                           const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26592
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26603
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26616
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26705
-    Length:          28
-    ReplacementText: 'sycl::native::exp(GELU_QUICK_COEF * x[i])'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26747
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26805
-    Length:          0
-    ReplacementText: ",\n                     const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26828
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26839
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26852
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26918
-    Length:          11
-    ReplacementText: 'sycl::tanh((float)(x[i]))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          26941
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27007
-    Length:          0
-    ReplacementText: ",\n                     const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27029
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27040
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27053
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27120
-    Length:          14
-    ReplacementText: 'sycl::fmax((float)(x[i]), (float)0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27146
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27244
-    Length:          0
-    ReplacementText: ",\n                           const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27267
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27278
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27291
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27357
-    Length:          14
-    ReplacementText: 'sycl::fmax((float)(x[i]), (float)0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27374
-    Length:          17
-    ReplacementText: 'sycl::fmin((float)(x[i]), 0.0f)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27420
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27485
-    Length:          0
-    ReplacementText: ",\n                    const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27507
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27518
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27531
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27647
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27734
-    Length:          0
-    ReplacementText: ",\n                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27758
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27769
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27782
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27815
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27833
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27851
-    Length:          21
-    ReplacementText: 'sycl::float2(0.f, 0.f)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          27986
-    Length:          10
-    ReplacementText: 'mean_var.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28012
-    Length:          10
-    ReplacementText: 'mean_var.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28108
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28153
-    Length:          28
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28204
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28251
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28353
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1118:0: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28361
-    Length:          15
-    ReplacementText: 'item_ct1.barrier(sycl::access::fence_space::local_space)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28456
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28489
-    Length:          10
-    ReplacementText: 'mean_var.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28531
-    Length:          10
-    ReplacementText: 'mean_var.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28591
-    Length:          17
-    ReplacementText: 'sycl::rsqrt(var + eps)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28755
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28856
-    Length:          0
-    ReplacementText: ",\n                       const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28875
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28889
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          28902
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29020
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29047
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29066
-    Length:          9
-    ReplacementText: 'item_ct1.get_group_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29085
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29170
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29201
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29220
-    Length:          9
-    ReplacementText: 'item_ct1.get_group_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29345
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29377
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29405
-    Length:          9
-    ReplacementText: 'item_ct1.get_group_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29477
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29588
-    Length:          0
-    ReplacementText: ",\n                        const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29642
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29656
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29669
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29792
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29883
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29946
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29973
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          29992
-    Length:          9
-    ReplacementText: 'item_ct1.get_group_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30050
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30164
-    Length:          0
-    ReplacementText: ",\n                    const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30183
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30197
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30210
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30329
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30356
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30375
-    Length:          9
-    ReplacementText: 'item_ct1.get_group_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30409
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30430
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30507
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30539
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30697
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30818
-    Length:          0
-    ReplacementText: ",\n                           const sycl::nd_item<3> &item_ct1, float *s_sum"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30838
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          30911
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31155
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31200
-    Length:          27
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31250
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31297
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31394
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1118:1: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31402
-    Length:          15
-    ReplacementText: "/*\n        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31482
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31709
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31754
-    Length:          27
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31804
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31851
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31948
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1118:2: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          31956
-    Length:          15
-    ReplacementText: "/*\n        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32036
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32103
-    Length:          22
-    ReplacementText: 'sycl::rsqrt(variance + eps)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32246
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32337
-    Length:          0
-    ReplacementText: ",\n                         const sycl::nd_item<3> &item_ct1, float *s_sum"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32361
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32372
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32385
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32418
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32679
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32724
-    Length:          27
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32774
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32821
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32918
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1118:3: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          32926
-    Length:          15
-    ReplacementText: 'item_ct1.barrier(sycl::access::fence_space::local_space)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33006
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33076
-    Length:          18
-    ReplacementText: 'sycl::rsqrt(mean + eps)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33230
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33241
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33462
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33483
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33593
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33600
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33621
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33628
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33679
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33690
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33861
-    Length:          20
-    ReplacementText: 'x[ib].dm[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33904
-    Length:          21
-    ReplacementText: 'x[ib].dm[1]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33968
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          33989
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34093
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34100
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34118
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34125
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34173
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34184
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34537
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34579
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34713
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34720
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34742
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34749
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34801
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34812
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          34983
-    Length:          20
-    ReplacementText: 'x[ib].dm[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35026
-    Length:          21
-    ReplacementText: 'x[ib].dm[1]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35222
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35264
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35390
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35397
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35415
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35422
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35470
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35481
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35666
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35695
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35780
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35794
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          35910
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          36001
-    Length:          0
-    ReplacementText: ",\n                                  const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          36026
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          36111
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          36329
-    Length:          19
-    ReplacementText: 'x[i].dm[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          36367
-    Length:          20
-    ReplacementText: 'x[i].dm[1]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          37268
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          37359
-    Length:          0
-    ReplacementText: ",\n                                  const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          37382
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          37481
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          37577
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          39257
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          39566
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          39657
-    Length:          0
-    ReplacementText: ",\n                                  const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          39732
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          39806
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          39991
-    Length:          19
-    ReplacementText: 'x[i].dm[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          40035
-    Length:          20
-    ReplacementText: 'x[i].dm[1]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          40869
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          40960
-    Length:          0
-    ReplacementText: ",\n                                  const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          41035
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          41159
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          41386
-    Length:          19
-    ReplacementText: 'x[i].dm[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          41430
-    Length:          20
-    ReplacementText: 'x[i].dm[1]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          42569
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          42660
-    Length:          0
-    ReplacementText: ",\n                                  const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          42735
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          42859
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          44069
-    Length:          0
-    ReplacementText: "/*\nDPCT1110:4: The total declared local variable size in device function dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          44076
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          44233
-    Length:          0
-    ReplacementText: ",\n                                        const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          44360
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          44371
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          44384
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          44669
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          44746
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          45550
-    Length:          19
-    ReplacementText: 'x[i].dm[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          45598
-    Length:          20
-    ReplacementText: 'x[i].dm[1]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          47936
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          47995
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          48048
-    Length:          0
-    ReplacementText: "/*\nDPCT1110:5: The total declared local variable size in device function dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          48055
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          48212
-    Length:          0
-    ReplacementText: ",\n                                        const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          48237
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          48248
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          48261
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          48620
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          48697
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          52351
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          52410
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          52463
-    Length:          0
-    ReplacementText: "/*\nDPCT1110:6: The total declared local variable size in device function dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          52470
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          52627
-    Length:          0
-    ReplacementText: ",\n                                        const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          52652
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          52663
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          52676
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          53015
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          53092
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          54147
-    Length:          19
-    ReplacementText: 'x[i].dm[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          54195
-    Length:          20
-    ReplacementText: 'x[i].dm[1]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          54792
-    Length:          6
-    ReplacementText: 'sycl::float4'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          54899
-    Length:          3
-    ReplacementText: 's.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          54923
-    Length:          3
-    ReplacementText: 's.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          54963
-    Length:          3
-    ReplacementText: 's.z()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          54987
-    Length:          3
-    ReplacementText: 's.w()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          55137
-    Length:          3
-    ReplacementText: 's.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          55151
-    Length:          3
-    ReplacementText: 's.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          55176
-    Length:          3
-    ReplacementText: 's.z()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          55190
-    Length:          3
-    ReplacementText: 's.w()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          57141
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          57245
-    Length:          0
-    ReplacementText: "/*\nDPCT1110:7: The total declared local variable size in device function dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          57252
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          57398
-    Length:          0
-    ReplacementText: ",\n                                        const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          57423
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          57786
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          57832
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          58662
-    Length:          19
-    ReplacementText: 'x[i].dm[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          58710
-    Length:          20
-    ReplacementText: 'x[i].dm[1]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          59016
-    Length:          6
-    ReplacementText: 'sycl::float4'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          59509
-    Length:          5
-    ReplacementText: 'sum.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          59666
-    Length:          5
-    ReplacementText: 'sum.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          59823
-    Length:          5
-    ReplacementText: 'sum.z()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          59980
-    Length:          5
-    ReplacementText: 'sum.w()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          60317
-    Length:          5
-    ReplacementText: 'sum.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          60333
-    Length:          5
-    ReplacementText: 'sum.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          60349
-    Length:          5
-    ReplacementText: 'sum.z()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          60365
-    Length:          5
-    ReplacementText: 'sum.w()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          61535
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          61594
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          61654
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          61811
-    Length:          0
-    ReplacementText: ",\n                                        const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          61938
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          61949
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          61962
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          62194
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          62271
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          65877
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          65988
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66086
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66104
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66181
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66208
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66241
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66436
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66463
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66496
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66613
-    Length:          0
-    ReplacementText: ",\n                          const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66636
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66647
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66660
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66743
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66754
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          66767
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          67043
-    Length:          9
-    ReplacementText: 'sycl::fabs((float)xi)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          67153
-    Length:          56
-    ReplacementText: 'sycl::fmax(amax, dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), amax, mask))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          67226
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          67349
-    Length:          14
-    ReplacementText: 'sycl::round(xi / d)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          67453
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          67460
-    Length:          10
-    ReplacementText: 'y[ib].ds.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          67498
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          67505
-    Length:          10
-    ReplacementText: 'y[ib].ds.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          67614
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68030
-    Length:          0
-    ReplacementText: ",\n            const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68072
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68083
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68096
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68132
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68143
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68156
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68190
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68201
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68214
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68254
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68265
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68278
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68848
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68890
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          68947
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69371
-    Length:          0
-    ReplacementText: ",\n            const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69412
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69423
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69436
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69469
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69480
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69493
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69527
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69538
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69551
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69591
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69602
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          69615
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          70018
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          70116
-    Length:          0
-    ReplacementText: ",\n                             const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          70138
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          70149
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          70164
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          70509
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          70545
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          70789
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          70800
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          70904
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          70924
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          71239
-    Length:          27
-    ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          71283
-    Length:          27
-    ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          71329
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          71343
-    Length:          19
-    ReplacementText: 'ds8.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          71454
-    Length:          6
-    ReplacementText: 'ds8f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          71479
-    Length:          6
-    ReplacementText: 'ds8f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          71637
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          71648
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          71734
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          71753
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          71773
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72088
-    Length:          27
-    ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72132
-    Length:          27
-    ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72323
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72337
-    Length:          19
-    ReplacementText: 'dm4.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72368
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72382
-    Length:          19
-    ReplacementText: 'ds8.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72426
-    Length:          6
-    ReplacementText: 'dm4f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72435
-    Length:          6
-    ReplacementText: 'ds8f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72466
-    Length:          6
-    ReplacementText: 'dm4f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72475
-    Length:          6
-    ReplacementText: 'ds8f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72814
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72825
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72946
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          72966
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          73452
-    Length:          27
-    ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          73861
-    Length:          27
-    ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          73947
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          73961
-    Length:          19
-    ReplacementText: 'ds8.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          74073
-    Length:          6
-    ReplacementText: 'ds8f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          74099
-    Length:          6
-    ReplacementText: 'ds8f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          74257
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          74268
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          74371
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          74390
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          74410
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          74896
-    Length:          27
-    ReplacementText: 'dpct::dp4a(vi0, u[2*i+0], sumi)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          75305
-    Length:          27
-    ReplacementText: 'dpct::dp4a(vi1, u[2*i+1], sumi)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          75536
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          75550
-    Length:          19
-    ReplacementText: 'dm5.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          75581
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          75595
-    Length:          19
-    ReplacementText: 'ds8.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          75639
-    Length:          6
-    ReplacementText: 'dm5f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          75648
-    Length:          6
-    ReplacementText: 'ds8f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          75679
-    Length:          6
-    ReplacementText: 'dm5f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          75688
-    Length:          6
-    ReplacementText: 'ds8f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          76007
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          76018
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          76145
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          76359
-    Length:          24
-    ReplacementText: 'dpct::dp4a(v[i], u[i], sumi)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          76511
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          76522
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          76608
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          76627
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          76647
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          76861
-    Length:          24
-    ReplacementText: 'dpct::dp4a(v[i], u[i], sumi)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77049
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77063
-    Length:          19
-    ReplacementText: 'dm8.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77094
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77108
-    Length:          19
-    ReplacementText: 'ds8.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77152
-    Length:          6
-    ReplacementText: 'dm8f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77161
-    Length:          6
-    ReplacementText: 'ds8f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77192
-    Length:          6
-    ReplacementText: 'dm8f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77201
-    Length:          6
-    ReplacementText: 'ds8f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77525
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77536
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77681
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          77732
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          78032
-    Length:          19
-    ReplacementText: 'dpct::dp4a(vi, u[i], 0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          78213
-    Length:          18
-    ReplacementText: 'dpct::dp4a(m, u[i], 0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          78305
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          78319
-    Length:          19
-    ReplacementText: 'dm2.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          78352
-    Length:          6
-    ReplacementText: 'dm2f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          78368
-    Length:          6
-    ReplacementText: 'dm2f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          78479
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          78490
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          78647
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          78685
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          79137
-    Length:          29
-    ReplacementText: 'dpct::dp4a(v[i], u[i], sumi_d_sc)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          79212
-    Length:          26
-    ReplacementText: 'dpct::dp4a(m, u[i], sumi_m)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          79348
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          79362
-    Length:          19
-    ReplacementText: 'dm2.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          79401
-    Length:          6
-    ReplacementText: 'dm2f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          79417
-    Length:          6
-    ReplacementText: 'dm2f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          79588
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          79599
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          79837
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          80584
-    Length:          19
-    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          80631
-    Length:          19
-    ReplacementText: 'dpct::dp4a(vi, u[i], 0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          80803
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          80814
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          81007
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          81288
-    Length:          27
-    ReplacementText: 'dpct::dp4a(v[i], u[i], sumi_sc)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          81585
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          81596
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          81782
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          81833
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          82152
-    Length:          47
-    ReplacementText: 'dpct::dp4a(v1i, u[2*i+1], dpct::dp4a(v0i, u[2*i+0], 0))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          82246
-    Length:          61
-    ReplacementText: 'dpct::dp4a(0x01010101, u[2*i+1], dpct::dp4a(0x01010101, u[2*i+0], 0))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          82481
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          82495
-    Length:          19
-    ReplacementText: 'dm4.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          82528
-    Length:          6
-    ReplacementText: 'dm4f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          82544
-    Length:          6
-    ReplacementText: 'dm4f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          82656
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          82667
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          82852
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          82871
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          82904
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83216
-    Length:          60
-    ReplacementText: 'dpct::dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83323
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83337
-    Length:          22
-    ReplacementText: 'ds8[i].convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83380
-    Length:          6
-    ReplacementText: 'ds8f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83425
-    Length:          6
-    ReplacementText: 'ds8f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83495
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83509
-    Length:          19
-    ReplacementText: 'dm4.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83542
-    Length:          6
-    ReplacementText: 'dm4f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83558
-    Length:          6
-    ReplacementText: 'dm4f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83729
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83740
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          83956
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          84007
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          84524
-    Length:          47
-    ReplacementText: 'dpct::dp4a(v0i, u[2*i+0], dpct::dp4a(v1i, u[2*i+1], 0))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          84618
-    Length:          61
-    ReplacementText: 'dpct::dp4a(0x01010101, u[2*i+0], dpct::dp4a(0x01010101, u[2*i+1], 0))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          84795
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          84809
-    Length:          19
-    ReplacementText: 'dm5.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          84842
-    Length:          6
-    ReplacementText: 'dm5f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          84858
-    Length:          6
-    ReplacementText: 'dm5f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          84970
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          84981
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85166
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85185
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85218
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85530
-    Length:          46
-    ReplacementText: 'dpct::dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85623
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85637
-    Length:          22
-    ReplacementText: 'ds8[i].convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85680
-    Length:          6
-    ReplacementText: 'ds8f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85725
-    Length:          6
-    ReplacementText: 'ds8f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85795
-    Length:          6
-    ReplacementText: 'sycl::float2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85809
-    Length:          19
-    ReplacementText: 'dm4.convert<float, sycl::rounding_mode::automatic>()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85842
-    Length:          6
-    ReplacementText: 'dm4f.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          85858
-    Length:          6
-    ReplacementText: 'dm4f.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          86029
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          86040
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          86250
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          86581
-    Length:          34
-    ReplacementText: 'dpct::vectorized_binary<sycl::char4>((vil | vih), 0x20202020, dpct::sub_sat())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          86668
-    Length:          19
-    ReplacementText: 'dpct::dp4a(vi, u[i], 0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          86837
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          86848
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87050
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87236
-    Length:          4
-    ReplacementText: 'sycl::int2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87362
-    Length:          8
-    ReplacementText: 'sumi_d.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87373
-    Length:          36
-    ReplacementText: 'dpct::dp4a(v[2*i+0], u[2*i+0], sumi_d.x())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87443
-    Length:          8
-    ReplacementText: 'sumi_d.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87454
-    Length:          36
-    ReplacementText: 'dpct::dp4a(v[2*i+1], u[2*i+1], sumi_d.x())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87525
-    Length:          8
-    ReplacementText: 'sumi_d.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87536
-    Length:          36
-    ReplacementText: 'dpct::dp4a(v[2*i+4], u[2*i+4], sumi_d.y())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87606
-    Length:          8
-    ReplacementText: 'sumi_d.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87617
-    Length:          36
-    ReplacementText: 'dpct::dp4a(v[2*i+5], u[2*i+5], sumi_d.y())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87727
-    Length:          8
-    ReplacementText: 'sumi_d.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87749
-    Length:          8
-    ReplacementText: 'sumi_d.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87863
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          87874
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          88517
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          88528
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          88582
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          88621
-    Length:          0
-    ReplacementText: ",\n                                                                     int *tile_x_qs_q4_0,\n                                                                     float *tile_x_d_q4_0"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          88658
-    Length:          66
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          88729
-    Length:          72
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          88844
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          88928
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          88939
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          89036
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          89674
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          90202
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          90375
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          90386
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          90476
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          90606
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          91301
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          91312
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          91963
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          91974
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          92028
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          92067
-    Length:          0
-    ReplacementText: ",\n                                                                     int *tile_x_qs_q4_1,\n                                                                     sycl::half2 *tile_x_dm_q4_1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          92104
-    Length:          67
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          92176
-    Length:          73
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          92367
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          92378
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          92475
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          93077
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          93545
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          93718
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          93729
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          93819
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          93949
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          94595
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          94606
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          95360
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          95371
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          95425
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          95464
-    Length:          0
-    ReplacementText: ",\n                                                                     int *tile_x_ql_q5_0,\n                                                                     float *tile_x_d_q5_0"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          95501
-    Length:          66
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          95572
-    Length:          72
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          95687
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          95771
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          95782
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          95879
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          96481
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          96997
-    Length:          26
-    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(qs0, 0x10101010, dpct::sub_sat())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          97388
-    Length:          26
-    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(qs1, 0x10101010, dpct::sub_sat())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          97814
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          97987
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          97998
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          98088
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          98218
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          98999
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          99010
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          99779
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          99790
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          99844
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          99883
-    Length:          0
-    ReplacementText: ",\n                                                                     int *tile_x_ql_q5_1,\n                                                                     sycl::half2 *tile_x_dm_q5_1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          99920
-    Length:          67
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          99992
-    Length:          73
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          100183
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          100194
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          100291
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          100892
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          102055
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          102228
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          102239
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          102329
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          102459
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          103145
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          103156
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          103671
-    Length:          21
-    ReplacementText: 'bq8_1->ds[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          103726
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          103737
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          103791
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          103830
-    Length:          0
-    ReplacementText: ",\n                                                                     int *tile_x_qs_q8_0,\n                                                                     float *tile_x_d_q8_0"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          103867
-    Length:          66
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          103938
-    Length:          72
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          104053
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          104137
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          104148
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          104245
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          104883
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          105342
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          105515
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          105526
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          105616
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          105746
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          106175
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          106186
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          106823
-    Length:          36
-    ReplacementText: 'bq8_1[bq8_offset + i].ds[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          106968
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          106979
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          107033
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          107072
-    Length:          0
-    ReplacementText: ",\n                                                                     int *tile_x_ql_q2_K,\n                                                                     sycl::half2 *tile_x_dm_q2_K,\n                                                                     int *tile_x_sc_q2_K"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          107097
-    Length:          67
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          107169
-    Length:          73
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          107247
-    Length:          69
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          107462
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          107473
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          107570
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          108160
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          108638
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          108968
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          109220
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          109231
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          109321
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          109451
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          110293
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          110304
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111071
-    Length:          36
-    ReplacementText: 'bq8_1[bq8_offset + i].ds[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111234
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111245
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111299
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111338
-    Length:          0
-    ReplacementText: ",\n                                                                     int *tile_x_ql_q3_K,\n                                                                     sycl::half2 *tile_x_dm_q3_K,\n                                                                     int *tile_x_qh_q3_K,\n                                                                     int *tile_x_sc_q3_K"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111347
-    Length:          67
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111419
-    Length:          73
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111497
-    Length:          69
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111571
-    Length:          69
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111814
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111825
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          111922
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          112496
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          113002
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          113332
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          113813
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          114376
-    Length:          39
-    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(sc_low | sc_high, 0x20202020, dpct::sub_sat())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          114500
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          114511
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          114601
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          114731
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          115560
-    Length:          19
-    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(vll, vlh, dpct::sub_sat())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          115794
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          115805
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          117199
-    Length:          20
-    ReplacementText: 'bq8i->ds[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          118931
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          118942
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          118996
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          119035
-    Length:          0
-    ReplacementText: ",\n                                                                     int *tile_x_ql_q4_K,\n                                                                     sycl::half2 *tile_x_dm_q4_K,\n                                                                     int *tile_x_sc_q4_K"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          119060
-    Length:          67
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          119132
-    Length:          73
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          119210
-    Length:          69
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          119425
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          119436
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          119533
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          120169
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          120702
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          121154
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          121766
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          121777
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          121867
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          121997
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          122461
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          122472
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          123675
-    Length:          21
-    ReplacementText: 'bq8i->ds[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          125442
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          125453
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          125507
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          125546
-    Length:          0
-    ReplacementText: ",\n                                                                     int *tile_x_ql_q5_K,\n                                                                     sycl::half2 *tile_x_dm_q5_K,\n                                                                     int *tile_x_sc_q5_K"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          125571
-    Length:          67
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          125643
-    Length:          73
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          125721
-    Length:          69
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          125936
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          125947
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          126044
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          126680
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          127820
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          128183
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          128795
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          128806
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          128896
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          129026
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          129550
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          129561
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          130421
-    Length:          38
-    ReplacementText: 'bq8_1[bq8_offset + 2*i].ds[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          130572
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          130583
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          130637
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          130676
-    Length:          0
-    ReplacementText: ",\n                                                                     int *tile_x_ql,\n                                                                     sycl::half2 *tile_x_dm,\n                                                                     int *tile_x_sc"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          130701
-    Length:          62
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          130768
-    Length:          68
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          130841
-    Length:          64
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          131036
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          131047
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          131144
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          131780
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          132523
-    Length:          32
-    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(ql0 | qh0, 0x20202020, dpct::sub_sat())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          132601
-    Length:          32
-    ReplacementText: 'dpct::vectorized_binary<sycl::char4>(ql1 | qh1, 0x20202020, dpct::sub_sat())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          133031
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          133371
-    Length:          13
-    ReplacementText: 'sycl::min(i, i_max)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          133606
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          133617
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          133707
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          133837
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          134572
-    Length:          0
-    ReplacementText: "/*\nDPCT1110:8: The total declared local variable size in device function mul_mat_q exceeds 128 bytes and may cause high register pressure. Consult with your hardware vendor to find the total register size available and adjust the code, or use smaller sub-group size to avoid high register pressure.\n*/\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          134579
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          134590
-    Length:          15
-    ReplacementText: __dpct_inline__
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          134837
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          134892
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_y_qs, sycl::half2 *tile_y_ds"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          135212
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          135294
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          135354
-    Length:          47
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          135406
-    Length:          53
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          135710
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          135742
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          135875
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          136038
-    Length:          41
-    ReplacementText: 'dpct::min((unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i), ncols_y-1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          136263
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          136383
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          136546
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          136568
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          136642
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          136713
-    Length:          29
-    ReplacementText: 'sycl::min(col_y_0 + ids, ncols_y-1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          136867
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          136984
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          137237
-    Length:          20
-    ReplacementText: '(*dsi_src)[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          137292
-    Length:          0
-    ReplacementText: "            /*\n            DPCT1118:9: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n            */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          137304
-    Length:          15
-    ReplacementText: "/*\n            DPCT1065:56: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n            */\n            item_ct1.barrier()"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          137814
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          137831
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          137907
-    Length:          0
-    ReplacementText: "            /*\n            DPCT1118:10: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n            */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          137919
-    Length:          15
-    ReplacementText: "/*\n            DPCT1065:57: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n            */\n            item_ct1.barrier()"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          138058
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          138251
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          138973
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          139462
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0,\n    int *tile_y_qs, sycl::half2 *tile_y_ds"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          139503
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          140376
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          140608
-    Length:          0
-    ReplacementText: ', tile_x_qs_q4_0, tile_x_d_q4_0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          140895
-    Length:          0
-    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          140905
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          142059
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          142278
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          142600
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1,\n    sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          142641
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          143513
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          143745
-    Length:          0
-    ReplacementText: ', tile_x_qs_q4_1, tile_x_dm_q4_1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          144032
-    Length:          0
-    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          144042
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          145195
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          145684
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0,\n    int *tile_y_qs, sycl::half2 *tile_y_ds"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          145725
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          146597
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          146829
-    Length:          0
-    ReplacementText: ', tile_x_ql_q5_0, tile_x_d_q5_0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          147117
-    Length:          0
-    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          147127
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          148281
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          148766
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1,\n    sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          148807
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          149678
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          149910
-    Length:          0
-    ReplacementText: ', tile_x_ql_q5_1, tile_x_dm_q5_1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          150197
-    Length:          0
-    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          150207
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          151360
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          151849
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0,\n    int *tile_y_qs, sycl::half2 *tile_y_ds"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          151890
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          152762
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          152994
-    Length:          0
-    ReplacementText: ', tile_x_qs_q8_0, tile_x_d_q8_0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          153282
-    Length:          0
-    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          153292
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          154447
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          154932
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K,\n    sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs,\n    sycl::half2 *tile_y_ds"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          154973
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          155844
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          156076
-    Length:          0
-    ReplacementText: ', tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          156363
-    Length:          0
-    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          156373
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          157528
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          157747
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          158069
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K,\n    sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, int *tile_x_sc_q3_K,\n    int *tile_y_qs, sycl::half2 *tile_y_ds"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          158110
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          158981
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          159213
-    Length:          0
-    ReplacementText: ', tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K, tile_x_sc_q3_K'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          159500
-    Length:          0
-    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          159510
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          160663
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          160882
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          161204
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K,\n    sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs,\n    sycl::half2 *tile_y_ds"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          161245
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          162115
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          162347
-    Length:          0
-    ReplacementText: ', tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          162633
-    Length:          0
-    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          162643
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          163795
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          164280
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K,\n    sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs,\n    sycl::half2 *tile_y_ds"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          164321
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          165191
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          165423
-    Length:          0
-    ReplacementText: ', tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          165709
-    Length:          0
-    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          165719
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          166870
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          167089
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          167411
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,\n    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          167452
-    Length:          5
-    ReplacementText: 'sycl::half2'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          168323
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          168555
-    Length:          0
-    ReplacementText: ', tile_x_ql, tile_x_dm, tile_x_sc'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          168842
-    Length:          0
-    ReplacementText: ', item_ct1, tile_y_qs, tile_y_ds'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          168852
-    Length:          13
-    ReplacementText: DPCT_COMPATIBILITY_TEMP
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          169571
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          169719
-    Length:          0
-    ReplacementText: ",\n                          const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          169743
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          169754
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          169767
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          170198
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          170269
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          170378
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          170646
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          170705
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          170830
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          170988
-    Length:          0
-    ReplacementText: ",\n                                   const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          171120
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          171131
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          171144
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          171225
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          172594
-    Length:          3
-    ReplacementText: 'v.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          172645
-    Length:          3
-    ReplacementText: 'v.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          172852
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173048
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173262
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173277
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173295
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173330
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173341
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173354
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173391
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173402
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173415
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173670
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173719
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          173938
-    Length:          19
-    ReplacementText: 'sycl::vec<sycl::half, 1>(x[ix]).convert<float, sycl::rounding_mode::automatic>()[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174346
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174405
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174466
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174741
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174756
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174774
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174813
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174824
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174837
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174876
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174887
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          174900
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          175200
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          175249
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          175513
-    Length:          19
-    ReplacementText: 'sycl::vec<sycl::half, 1>(x[ix]).convert<float, sycl::rounding_mode::automatic>()[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          175696
-    Length:          42
-    ReplacementText: 'dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          175755
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          175816
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          175988
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176100
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176115
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176143
-    Length:          17
-    ReplacementText: 'sycl::vec<float, 1>(*xi).convert<sycl::half, sycl::rounding_mode::automatic>()[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176172
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176246
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176265
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176282
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176297
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176370
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176670
-    Length:          0
-    ReplacementText: ",\n                                   const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176692
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176703
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          176716
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          177406
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          177689
-    Length:          21
-    ReplacementText: 'sycl::fmax(amax, sycl::fabs((float)v))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          177917
-    Length:          10
-    ReplacementText: 'sycl::round((float)x0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          177945
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          178239
-    Length:          8
-    ReplacementText: 'sycl::fabs((float)v)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          178270
-    Length:          8
-    ReplacementText: 'sycl::fabs((float)v)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          178569
-    Length:          28
-    ReplacementText: 'dpct::min(15, (int8_t)(x0 + 8.5f))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          178627
-    Length:          28
-    ReplacementText: 'dpct::min(15, (int8_t)(x1 + 8.5f))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          178735
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          179187
-    Length:          10
-    ReplacementText: 'dsti->dm.x()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          179207
-    Length:          10
-    ReplacementText: 'dsti->dm.y()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          179404
-    Length:          28
-    ReplacementText: 'dpct::min(15, (int8_t)(x0 + 0.5f))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          179462
-    Length:          28
-    ReplacementText: 'dpct::min(15, (int8_t)(x1 + 0.5f))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          179611
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          179905
-    Length:          0
-    ReplacementText: ",\n                                 const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          179928
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          179939
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          179952
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          180465
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          180585
-    Length:          23
-    ReplacementText: 'sycl::max(0.001f, high - low)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          180628
-    Length:          23
-    ReplacementText: 'sycl::min(1.0f, sycl::max(0.0f, y))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          180878
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          181499
-    Length:          23
-    ReplacementText: 'sycl::log(1.0f / freq_scale)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          181547
-    Length:          11
-    ReplacementText: 'sycl::cos(theta)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          181586
-    Length:          11
-    ReplacementText: 'sycl::sin(theta)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          181700
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          181899
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          181926
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          181937
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          181950
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182032
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182043
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182056
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182214
-    Length:          34
-    ReplacementText: 'dpct::pow(freq_base, -float(col)/ncols)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182588
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182823
-    Length:          0
-    ReplacementText: ",\n    const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182850
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182861
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182874
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182956
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182967
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          182980
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          183427
-    Length:          27
-    ReplacementText: 'dpct::pow(theta_scale, col/2.0f)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          183784
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          183947
-    Length:          0
-    ReplacementText: ', const sycl::nd_item<3> &item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          183971
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          183982
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          183995
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          184119
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          184130
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          184143
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          184263
-    Length:          32
-    ReplacementText: 'dpct::pow(freq_base, -2.0f*col/ncols)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          184406
-    Length:          17
-    ReplacementText: 'sycl::min(p, n_ctx - 2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          184480
-    Length:          11
-    ReplacementText: 'sycl::sin((float)theta)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          184521
-    Length:          11
-    ReplacementText: 'sycl::cos((float)theta)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          184759
-    Length:          21
-    ReplacementText: 'sycl::max(p - n_ctx - 2, 0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          184833
-    Length:          17
-    ReplacementText: 'sycl::sin((float)block_theta)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          184886
-    Length:          17
-    ReplacementText: 'sycl::cos((float)block_theta)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185151
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185335
-    Length:          0
-    ReplacementText: ",\n                                 const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185359
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185370
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185383
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185464
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185475
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185488
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185631
-    Length:          15
-    ReplacementText: 'dpct::pow(m0, k + 1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185675
-    Length:          42
-    ReplacementText: 'dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185767
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185843
-    Length:          0
-    ReplacementText: ",\n                           const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185867
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185899
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          185973
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          186058
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          186150
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          186268
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          186345
-    Length:          0
-    ReplacementText: ",\n                              const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          186383
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          186410
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          186624
-    Length:          15
-    ReplacementText: "/*\n    DPCT1065:58: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n    */\n    item_ct1.barrier()"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          187320
-    Length:          0
-    ReplacementText: "            /*\n            DPCT1118:11: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n            */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          187332
-    Length:          15
-    ReplacementText: "/*\n            DPCT1065:59: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n            */\n            item_ct1.barrier()"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          187375
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          187500
-    Length:          0
-    ReplacementText: ",\n                              const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          187524
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          187535
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          187548
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          187581
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          187592
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          187605
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          187993
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          188122
-    Length:          0
-    ReplacementText: ",\n                         const sycl::nd_item<3> &item_ct1, float *buf"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          188147
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          188181
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          188305
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          188342
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          188391
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          188421
-    Length:          57
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          188670
-    Length:          46
-    ReplacementText: 'sycl::max(max_val, x[ix]*scale + (y ? y[iy] : 0.0f))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          188801
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          188914
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1118:12: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          188922
-    Length:          15
-    ReplacementText: "/*\n        DPCT1065:60: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          189014
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1118:13: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          189022
-    Length:          15
-    ReplacementText: "/*\n        DPCT1065:61: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          189113
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          189311
-    Length:          50
-    ReplacementText: 'sycl::native::exp((x[ix]*scale + (y ? y[iy] : 0.0f)) - max_val)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          189483
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          189590
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1118:14: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          189598
-    Length:          15
-    ReplacementText: "/*\n        DPCT1065:62: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          189686
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1118:15: SYCL group functions and algorithms must be encountered in converged control flow. You may need to adjust the code.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          189694
-    Length:          15
-    ReplacementText: "/*\n        DPCT1065:63: Consider replacing sycl::nd_item::barrier() with sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better performance if there is no access to global memory.\n        */\n        item_ct1.barrier()"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          189773
-    Length:          0
-    ReplacementText: ', item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          189962
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190048
-    Length:          0
-    ReplacementText: ",\n                      const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190070
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190081
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190094
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190186
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190287
-    Length:          0
-    ReplacementText: ",\n                      const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190309
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190320
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190333
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190458
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190515
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190671
-    Length:          0
-    ReplacementText: ",\n        const sycl::nd_item<3> &item_ct1"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190693
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190707
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          190720
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          191022
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          191095
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          191134
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          191257
-    Length:          18
-    ReplacementText: 'sycl::vec<float, 1>(0.0f).convert<sycl::half, sycl::rounding_mode::automatic>()[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          191325
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(0)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          191378
-    Length:          44
-    ReplacementText: 'sycl::vec<float, 1>(x[offset_src + iih * IW + iiw]).convert<sycl::half, sycl::rounding_mode::automatic>()[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          191670
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          191738
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          191754
-    Length:          30
-    ReplacementText: 1, 1, CUDA_GET_ROWS_BLOCK_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          191897
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          191913
-    Length:          28
-    ReplacementText: 'ne11*ne12, ne10, block_num_x'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          192443
-    Length:          294
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) {\n        k_get_rows<qk, qr, dq>(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          192737
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          192985
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          193053
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          193069
-    Length:          30
-    ReplacementText: 1, 1, CUDA_GET_ROWS_BLOCK_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          193206
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          193222
-    Length:          28
-    ReplacementText: 'ne11*ne12, ne10, block_num_x'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          193719
-    Length:          288
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) {\n          k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2, s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          194007
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          194368
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          196599
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          196614
-    Length:          0
-    ReplacementText: '(1, 1, 1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          196628
-    Length:          12
-    ReplacementText: 'block_dims[2]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          196697
-    Length:          12
-    ReplacementText: 'block_dims[1]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          196712
-    Length:          54
-    ReplacementText: 'std::min<unsigned int>(ne1, block_size / (unsigned int)block_dims[2])'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          196780
-    Length:          12
-    ReplacementText: 'block_dims[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          196795
-    Length:          88
-    ReplacementText: 'std::min(std::min<unsigned int>(ne2*ne3, block_size / (unsigned int)block_dims[2] / (unsigned int)block_dims[1]), 64U)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          196898
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          196914
-    Length:          188
-    ReplacementText: '(ne2*ne3 + block_dims[0] - 1) / block_dims[0], (ne1 + block_dims[1] - 1) / block_dims[1], (hne0 + block_dims[2] - 1) / block_dims[2]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          197122
-    Length:          12
-    ReplacementText: 'block_nums[0]'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          197342
-    Length:          284
-    ReplacementText: "{\n                  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n                  stream->parallel_for(\n                    sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) * sycl::range<3>(1, 1, block_size), sycl::range<3>(1, 1, block_size)), \n                    [=](sycl::nd_item<3> item_ct1) {\n                      k_bin_bcast_unravel<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, s13, item_ct1);\n                    });\n                }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          197626
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          197649
-    Length:          0
-    ReplacementText: "                /*\n                DPCT1049:16: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n                */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          197665
-    Length:          277
-    ReplacementText: "  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n                  stream->parallel_for(\n                    sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                    [=](sycl::nd_item<3> item_ct1) {\n                      k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3, ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12, s13, item_ct1);\n                    });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          197942
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          198176
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          198286
-    Length:          114
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          198400
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          198474
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          198583
-    Length:          68
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        gelu_f32(x, dst, k, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          198651
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          198725
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          198834
-    Length:          68
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        silu_f32(x, dst, k, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          198902
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          198982
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          199091
-    Length:          74
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        gelu_quick_f32(x, dst, k, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          199165
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          199239
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          199348
-    Length:          68
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        tanh_f32(x, dst, k, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          199416
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          199490
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          199599
-    Length:          68
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        relu_f32(x, dst, k, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          199667
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          199775
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          199884
-    Length:          90
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        leaky_relu_f32(x, dst, k, negative_slope, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          199974
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200047
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200154
-    Length:          66
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        sqr_f32(x, dst, k, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200220
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200332
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200434
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200450
-    Length:          15
-    ReplacementText: 1, 1, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200476
-    Length:          73
-    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200549
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200578
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200594
-    Length:          10
-    ReplacementText: 1, 1, 1024
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200607
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200615
-    Length:          68
-    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                norm_f32<1024>(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200683
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200823
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200925
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200941
-    Length:          15
-    ReplacementText: 1, 1, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          200967
-    Length:          102
-    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            const float eps_ct4 = eps;\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                group_norm_f32<WARP_SIZE>(x, dst, group_size, ne_elements, eps_ct4, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201069
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201098
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201114
-    Length:          10
-    ReplacementText: 1, 1, 1024
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201127
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201135
-    Length:          97
-    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            const float eps_ct4 = eps;\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                group_norm_f32<1024>(x, dst, group_size, ne_elements, eps_ct4, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201232
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201361
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201470
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201483
-    Length:          20
-    ReplacementText: ne2, ne1, num_blocks
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201510
-    Length:          80
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        concat_f32(x, y, dst, ne0, ne02, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201590
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201726
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201874
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201887
-    Length:          39
-    ReplacementText: 'ne02, (ne01 * scale_factor), num_blocks'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          201933
-    Length:          101
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202034
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202195
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202298
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202311
-    Length:          20
-    ReplacementText: ne2, ne1, num_blocks
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202338
-    Length:          83
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202421
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202537
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202639
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202655
-    Length:          15
-    ReplacementText: 1, 1, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202681
-    Length:          77
-    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                rms_norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202758
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202787
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202803
-    Length:          10
-    ReplacementText: 1, 1, 1024
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202816
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202824
-    Length:          72
-    ReplacementText: "stream->submit(\n          [&](sycl::handler &cgh) {\n            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32), cgh);\n\n            cgh.parallel_for(\n              sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n              [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n                rms_norm_f32<1024>(x, dst, ncols, eps, item_ct1, s_sum_acc_ct1.get_pointer());\n              });\n          });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          202896
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203019
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203151
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203167
-    Length:          18
-    ReplacementText: 1, ky, block_num_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203198
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203214
-    Length:          32
-    ReplacementText: 1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203253
-    Length:          74
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(num_blocks * block_size, block_size), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          quantize_q8_1(x, vy, kx, kx_padded, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203327
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203514
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203635
-    Length:          108
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203743
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203851
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203923
-    Length:          51
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block_q2_K(vx, y, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          203974
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          204152
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          204224
-    Length:          51
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block_q3_K(vx, y, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          204275
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          204453
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          204509
-    Length:          51
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 32), sycl::range<3>(1, 1, 32)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block_q4_K(vx, y, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          204560
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          204668
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          204740
-    Length:          51
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block_q5_K(vx, y, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          204791
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          204969
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          205041
-    Length:          51
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, nb) * sycl::range<3>(1, 1, 64), sycl::range<3>(1, 1, 64)), \n        [=](sycl::nd_item<3> item_ct1) {\n          dequantize_block_q6_K(vx, y, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          205092
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          207504
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          207771
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          207787
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          207817
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          207833
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          207869
-    Length:          127
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          207996
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208128
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208286
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208302
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208332
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208348
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208384
-    Length:          127
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208511
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208643
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208801
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208817
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208847
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208863
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          208899
-    Length:          127
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209026
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209158
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209316
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209332
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209362
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209378
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209414
-    Length:          127
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209541
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209673
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209831
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209847
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209877
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209893
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          209929
-    Length:          127
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210056
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210187
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210397
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210413
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210443
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210459
-    Length:          9
-    ReplacementText: 1, ny, 32
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210475
-    Length:          92
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210567
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210698
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210865
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210881
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210911
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210927
-    Length:          9
-    ReplacementText: 1, ny, 32
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          210943
-    Length:          92
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211035
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211166
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211333
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211349
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211379
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211395
-    Length:          9
-    ReplacementText: 1, ny, 32
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211411
-    Length:          92
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211503
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211634
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211703
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211719
-    Length:          8
-    ReplacementText: 1, 1, 32
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211734
-    Length:          80
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211814
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          211945
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212112
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212128
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212158
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212174
-    Length:          9
-    ReplacementText: 1, ny, 32
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212190
-    Length:          92
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212282
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212410
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212568
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212584
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212614
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212630
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212666
-    Length:          115
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols, nrows, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212781
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          212906
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213053
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213069
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213099
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213115
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213151
-    Length:          153
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ, vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213304
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213429
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213576
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213592
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213622
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213638
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213674
-    Length:          153
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ, vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213827
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          213952
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214099
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214115
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214145
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214161
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214197
-    Length:          153
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ, vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214350
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214475
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214622
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214638
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214668
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214684
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214720
-    Length:          153
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ, vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214873
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          214998
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215145
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215161
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215191
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215207
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215243
-    Length:          153
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ, vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215396
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215521
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215667
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215683
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215713
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215729
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215765
-    Length:          152
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ, vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          215917
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216042
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216188
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216204
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216234
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216250
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216286
-    Length:          152
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ, vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216438
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216563
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216709
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216725
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216755
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216771
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216807
-    Length:          152
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ, vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          216959
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217084
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217230
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217246
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217276
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217292
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217328
-    Length:          152
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ, vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217480
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217605
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217751
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217767
-    Length:          17
-    ReplacementText: 1, 1, block_num_y
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217797
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217813
-    Length:          29
-    ReplacementText: 1, GGML_CUDA_MMV_Y, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          217849
-    Length:          152
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ, vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          218001
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          218199
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          218220
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          218250
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219170
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219186
-    Length:          27
-    ReplacementText: 1, block_num_y, block_num_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219226
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219242
-    Length:          20
-    ReplacementText: 1, nwarps, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219337
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:20: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219345
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_0_acc_ct1.get_pointer(), tile_x_d_q4_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219481
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219534
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:21: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219542
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_0_acc_ct1.get_pointer(), tile_x_d_q4_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219678
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219687
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219882
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219903
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          219933
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          220853
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          220869
-    Length:          27
-    ReplacementText: 1, block_num_y, block_num_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          220909
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          220925
-    Length:          20
-    ReplacementText: 1, nwarps, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          221020
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:22: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          221028
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) +     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_1_acc_ct1.get_pointer(), tile_x_dm_q4_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          221164
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          221217
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:23: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          221225
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE) +     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q4_1_acc_ct1.get_pointer(), tile_x_dm_q4_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          221361
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          221370
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          221565
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          221586
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          221616
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          222536
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          222552
-    Length:          27
-    ReplacementText: 1, block_num_y, block_num_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          222592
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          222608
-    Length:          20
-    ReplacementText: 1, nwarps, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          222703
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:24: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          222711
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_0_acc_ct1.get_pointer(), tile_x_d_q5_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          222847
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          222900
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:25: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          222908
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_0_acc_ct1.get_pointer(), tile_x_d_q5_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          223044
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          223053
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          223248
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          223269
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          223299
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224219
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224235
-    Length:          27
-    ReplacementText: 1, block_num_y, block_num_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224275
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224291
-    Length:          20
-    ReplacementText: 1, nwarps, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224386
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:26: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224394
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_1_acc_ct1.get_pointer(), tile_x_dm_q5_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224530
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224583
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:27: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224591
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_1_acc_ct1.get_pointer(), tile_x_dm_q5_1_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224727
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224736
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224931
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224952
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          224982
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          225902
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          225918
-    Length:          27
-    ReplacementText: 1, block_num_y, block_num_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          225958
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          225974
-    Length:          20
-    ReplacementText: 1, nwarps, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          226069
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:28: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          226077
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q8_0_acc_ct1.get_pointer(), tile_x_d_q8_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          226213
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          226266
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:29: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          226274
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_qs_q8_0_acc_ct1.get_pointer(), tile_x_d_q8_0_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          226410
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          226419
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          226614
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          226635
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          226665
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          227585
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          227601
-    Length:          27
-    ReplacementText: 1, block_num_y, block_num_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          227641
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          227657
-    Length:          20
-    ReplacementText: 1, nwarps, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          227752
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:30: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          227760
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4)     + mmq_y/4), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q2_K_acc_ct1.get_pointer(), tile_x_dm_q2_K_acc_ct1.get_pointer(), tile_x_sc_q2_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          227896
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          227949
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:31: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          227957
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4)     + mmq_y/4), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q2_K_acc_ct1.get_pointer(), tile_x_dm_q2_K_acc_ct1.get_pointer(), tile_x_sc_q2_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          228093
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          228102
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          228297
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          228318
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          228365
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          229285
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          229301
-    Length:          27
-    ReplacementText: 1, block_num_y, block_num_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          229341
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          229357
-    Length:          20
-    ReplacementText: 1, nwarps, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          229452
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          229460
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/2)     + mmq_y/2), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4)     + mmq_y/4), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q3_K_acc_ct1.get_pointer(), tile_x_dm_q3_K_acc_ct1.get_pointer(), tile_x_qh_q3_K_acc_ct1.get_pointer(), tile_x_sc_q3_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          229596
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          229649
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          229657
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI3_K) + mmq_y/QI3_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/2)     + mmq_y/2), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/4)     + mmq_y/4), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q3_K_acc_ct1.get_pointer(), tile_x_dm_q3_K_acc_ct1.get_pointer(), tile_x_qh_q3_K_acc_ct1.get_pointer(), tile_x_sc_q3_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          229793
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          229809
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          230004
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          230025
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          230055
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          230975
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          230991
-    Length:          27
-    ReplacementText: 1, block_num_y, block_num_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231031
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231047
-    Length:          20
-    ReplacementText: 1, nwarps, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231142
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231150
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q4_K_acc_ct1.get_pointer(), tile_x_dm_q4_K_acc_ct1.get_pointer(), tile_x_sc_q4_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231286
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231339
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231347
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE)       + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q4_K_acc_ct1.get_pointer(), tile_x_dm_q4_K_acc_ct1.get_pointer(), tile_x_sc_q4_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231483
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231492
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231687
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231708
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          231738
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          232658
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          232674
-    Length:          27
-    ReplacementText: 1, block_num_y, block_num_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          232714
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          232730
-    Length:          20
-    ReplacementText: 1, nwarps, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          232825
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          232833
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_K_acc_ct1.get_pointer(), tile_x_dm_q5_K_acc_ct1.get_pointer(), tile_x_sc_q5_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          232969
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          233022
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          233030
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_q5_K_acc_ct1.get_pointer(), tile_x_dm_q5_K_acc_ct1.get_pointer(), tile_x_sc_q5_K_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          233166
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          233175
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          233370
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          233391
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          233421
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          234341
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          234357
-    Length:          27
-    ReplacementText: 1, block_num_y, block_num_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          234397
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          234413
-    Length:          20
-    ReplacementText: 1, nwarps, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          234508
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          234516
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_acc_ct1.get_pointer(), tile_x_dm_acc_ct1.get_pointer(), tile_x_sc_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          234652
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          234705
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          234713
-    Length:          136
-    ReplacementText: "{\n          dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->submit(\n            [&](sycl::handler &cgh) {\n              sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(sycl::range<1>(mmq_y * (2*WARP_SIZE)     + mmq_y), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K), cgh);\n              sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(sycl::range<1>(mmq_y * (WARP_SIZE/8)     + mmq_y/8), cgh);\n              sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE), cgh);\n              sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(sycl::range<1>(mmq_x * WARP_SIZE/QI8_1), cgh);\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_nums * block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, item_ct1, tile_x_ql_acc_ct1.get_pointer(), tile_x_dm_acc_ct1.get_pointer(), tile_x_sc_acc_ct1.get_pointer(), tile_y_qs_acc_ct1.get_pointer(), tile_y_ds_acc_ct1.get_pointer());\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          234849
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          234858
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235043
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235077
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235093
-    Length:          23
-    ReplacementText: nchannels_y, nrows_x, 1
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235129
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235145
-    Length:          15
-    ReplacementText: 1, 1, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235167
-    Length:          115
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235282
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235524
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235558
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235574
-    Length:          23
-    ReplacementText: nchannels_y, nrows_x, 1
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235610
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235626
-    Length:          15
-    ReplacementText: 1, 1, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235648
-    Length:          157
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * block_dims, block_dims), \n        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n          mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          235805
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          236061
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          236170
-    Length:          157
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), \n        [=](sycl::nd_item<3> item_ct1) {\n          cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          236327
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          236583
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          236692
-    Length:          157
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), \n        [=](sycl::nd_item<3> item_ct1) {\n          cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          236849
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          237106
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          237207
-    Length:          148
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), \n      [=](sycl::nd_item<3> item_ct1) {\n        cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          237355
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          237612
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          237713
-    Length:          148
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), \n      [=](sycl::nd_item<3> item_ct1) {\n        cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          237861
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          238118
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          238219
-    Length:          148
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks), sycl::range<3>(1, 1, 1)), \n      [=](sycl::nd_item<3> item_ct1) {\n        cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          238367
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          238623
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          238732
-    Length:          157
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)), \n        [=](sycl::nd_item<3> item_ct1) {\n          cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          238889
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          238983
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          239094
-    Length:          77
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        scale_f32(x, dst, scale, k, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          239171
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          239280
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          239391
-    Length:          80
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE)), \n      [=](sycl::nd_item<3> item_ct1) {\n        clamp_f32(x, dst, min, max, k, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          239471
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          239709
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          239776
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          239925
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          239941
-    Length:          22
-    ReplacementText: 1, num_blocks_x, nrows
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          239992
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          240000
-    Length:          168
-    ReplacementText: "  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->parallel_for(\n            sycl::nd_range<3>(block_nums * block_dims, block_dims), \n            [=](sycl::nd_item<3> item_ct1) {\n              rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, item_ct1);\n            });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          240168
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          240183
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          240191
-    Length:          167
-    ReplacementText: "  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->parallel_for(\n            sycl::nd_range<3>(block_nums * block_dims, block_dims), \n            [=](sycl::nd_item<3> item_ct1) {\n              rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, ext_factor, attn_factor, corr_dims, item_ct1);\n            });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          240358
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          240619
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          240686
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          240835
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          240851
-    Length:          22
-    ReplacementText: 1, num_blocks_x, nrows
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241009
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241017
-    Length:          206
-    ReplacementText: "  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->parallel_for(\n            sycl::nd_range<3>(block_nums * block_dims, block_dims), \n            [=](sycl::nd_item<3> item_ct1) {\n              rope_neox<T, false>(x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, theta_scale, inv_ndims, item_ct1);\n            });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241223
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241238
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241246
-    Length:          205
-    ReplacementText: "  dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n          stream->parallel_for(\n            sycl::nd_range<3>(block_nums * block_dims, block_dims), \n            [=](sycl::nd_item<3> item_ct1) {\n              rope_neox<T, true>(x, dst, ncols, n_dims, pos, freq_scale, p_delta_rows, ext_factor, attn_factor, corr_dims, theta_scale, inv_ndims, item_ct1);\n            });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241451
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241638
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241705
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241721
-    Length:          28
-    ReplacementText: '1, 1, CUDA_ROPE_BLOCK_SIZE/4'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241850
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241866
-    Length:          22
-    ReplacementText: 1, nrows, num_blocks_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          241895
-    Length:          115
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) {\n        rope_glm_f32(x, dst, ncols, pos, freq_scale, p_delta_rows, freq_base, n_ctx, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242010
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242240
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242273
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242289
-    Length:          27
-    ReplacementText: 1, 1, CUDA_ALIBI_BLOCK_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242421
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242437
-    Length:          22
-    ReplacementText: 1, nrows, num_blocks_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242466
-    Length:          99
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) {\n        alibi_f32(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242565
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242664
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242697
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242713
-    Length:          15
-    ReplacementText: 1, 1, WARP_SIZE
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242741
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242775
-    Length:          68
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n        k_sum_rows_f32(x, dst, ncols, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242843
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          242966
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243097
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243113
-    Length:          11
-    ReplacementText: 1, 1, ncols
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243137
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243201
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243209
-    Length:          86
-    ReplacementText: "stream->parallel_for(\n          sycl::nd_range<3>(block_nums * block_dims, block_dims), \n          [=](sycl::nd_item<3> item_ct1) {\n            k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);\n          });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243295
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243339
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243347
-    Length:          87
-    ReplacementText: "stream->parallel_for(\n          sycl::nd_range<3>(block_nums * block_dims, block_dims), \n          [=](sycl::nd_item<3> item_ct1) {\n            k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);\n          });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243434
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243635
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243668
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243839
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243855
-    Length:          23
-    ReplacementText: 1, block_num_x, nrows_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243885
-    Length:          99
-    ReplacementText: "stream->parallel_for(\n      sycl::nd_range<3>(block_nums * block_dims, block_dims), \n      [=](sycl::nd_item<3> item_ct1) {\n        diag_mask_inf_f32(x, dst, ncols_x, rows_per_channel, n_past, item_ct1);\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          243984
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244142
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244270
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244286
-    Length:          13
-    ReplacementText: 1, 1, nth
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244312
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244328
-    Length:          13
-    ReplacementText: 1, 1, nrows_x
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244344
-    Length:          0
-    ReplacementText: "    /*\n    DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n    */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244348
-    Length:          87
-    ReplacementText: "stream->submit(\n      [&](sycl::handler &cgh) {\n        /*\n        DPCT1101:96: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was replaced with a value. Modify the code to use the original expression, provided in comments, if it is correct.\n        */\n        sycl::local_accessor<float, 1> buf_acc_ct1(sycl::range<1>(32/*CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);\n\n        cgh.parallel_for(\n          sycl::nd_range<3>(block_nums * block_dims, block_dims), \n          [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {\n            soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1, buf_acc_ct1.get_pointer());\n          });\n      });"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244435
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244488
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244628
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244805
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244821
-    Length:          18
-    ReplacementText: IC, OH, num_blocks
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          244846
-    Length:          166
-    ReplacementText: "{\n      dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16});\n\n      stream->parallel_for(\n        sycl::nd_range<3>(block_nums * sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE), sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE)), \n        [=](sycl::nd_item<3> item_ct1) {\n          im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1, item_ct1);\n        });\n    }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          245012
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          245872
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          245946
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          247185
-    Length:          43
-    ReplacementText: 'DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(look_ahead_size, dpct::get_in_order_queue()))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          247595
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          247658
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          247732
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          248074
-    Length:          13
-    ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          248087
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          248125
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          248186
-    Length:          0
-    ReplacementText: "/*\nDPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.\n*/\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          248286
-    Length:          11
-    ReplacementText: 'dpct::device_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          248548
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          248622
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          249332
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1082:65: Migration of CUmemAllocationProp type is not supported.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          249511
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1082:66: Migration of CUmemGenericAllocationHandle type is not supported.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          249556
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1007:69: Migration of cuMemCreate is not supported.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          249729
-    Length:          0
-    ReplacementText: "            /*\n            DPCT1007:70: Migration of cuMemAddressReserve is not supported.\n            */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          249877
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1007:71: Migration of cuMemMap is not supported.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          250001
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1082:72: Migration of CUmemAccessDesc type is not supported.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          250190
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1007:73: Migration of cuMemSetAccess is not supported.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          250981
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          251044
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          251118
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          251451
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          251524
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          251553
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          251749
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          251808
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          251837
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          251999
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          253084
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          253419
-    Length:          35
-    ReplacementText: 'DPCT_CHECK_ERROR(g_device_count = dpct::dev_mgr::instance().device_count())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          253458
-    Length:          11
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          254270
-    Length:          8
-    ReplacementText: int
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          254308
-    Length:          24
-    ReplacementText: 'DPCT_CHECK_ERROR(device = id)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          254335
-    Length:          0
-    ReplacementText: "            /*\n            DPCT1028:74: The cuDeviceGetAttribute was not migrated because parameter CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED is unsupported.\n            */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          254488
-    Length:          0
-    ReplacementText: "                /*\n                DPCT1082:75: Migration of CUmemAllocationProp type is not supported.\n                */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          254723
-    Length:          0
-    ReplacementText: "                /*\n                DPCT1007:76: Migration of cuMemGetAllocationGranularity is not supported.\n                */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          254977
-    Length:          14
-    ReplacementText: 'dpct::device_info'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255021
-    Length:          23
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_device_info'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255045
-    Length:          5
-    ReplacementText: prop
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255052
-    Length:          2
-    ReplacementText: 'dpct::dev_mgr::instance().get_device(id)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255055
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255058
-    Length:          0
-    ReplacementText: "            /*\n            DPCT1005:77: The SYCL device version is different from CUDA Compute Compatibility. You may need to rewrite this code.\n            */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255151
-    Length:          4
-    ReplacementText: 'get_name()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255162
-    Length:          5
-    ReplacementText: 'get_major_version()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255174
-    Length:          5
-    ReplacementText: 'get_minor_version()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255286
-    Length:          14
-    ReplacementText: 'get_global_mem_size()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255454
-    Length:          0
-    ReplacementText: "            /*\n            DPCT1005:78: The SYCL device version is different from CUDA Compute Compatibility. You may need to rewrite this code.\n            */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255498
-    Length:          5
-    ReplacementText: 'get_major_version()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255514
-    Length:          5
-    ReplacementText: 'get_minor_version()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255906
-    Length:          0
-    ReplacementText: "                /*\n                DPCT1025:79: The SYCL queue is created ignoring the flag and priority options.\n                */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          255933
-    Length:          72
-    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[id][is] = dpct::get_current_device().create_queue())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          256084
-    Length:          35
-    ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[id] = &dpct::get_in_order_queue())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          256122
-    Length:          0
-    ReplacementText: "            /*\n            DPCT1027:80: The call to cublasSetMathMode was replaced with 0 because this functionality is redundant in SYCL.\n            */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          256147
-    Length:          67
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          256401
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257037
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257152
-    Length:          11
-    ReplacementText: 'dpct::err0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257170
-    Length:          36
-    ReplacementText: 'DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257208
-    Length:          0
-    ReplacementText: "    /*\n    DPCT1000:82: Error handling if-stmt was detected but could not be rewritten.\n    */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257223
-    Length:          11
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257265
-    Length:          28
-    ReplacementText: "        /*\n        DPCT1026:83: The call to cudaGetLastError was removed because this functionality is redundant in SYCL.\n        */\n        /*\n        DPCT1001:81: The statement could not be removed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257379
-    Length:          0
-    ReplacementText: "            /*\n            DPCT1009:84: SYCL uses exceptions to report errors and does not use the error codes. The original code was commented out and a warning string was inserted. You need to rewrite this code.\n            */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257411
-    Length:          23
-    ReplacementText: '"cudaGetErrorString is not supported"/*cudaGetErrorString(err)*/'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257485
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257524
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257541
-    Length:          17
-    ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257558
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257562
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257571
-    Length:          11
-    ReplacementText: 'dpct::err0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257713
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257734
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257741
-    Length:          14
-    ReplacementText: 'dpct::memcpy_direction'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          257841
-    Length:          22
-    ReplacementText: 'dpct::host_to_device'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          258114
-    Length:          24
-    ReplacementText: 'dpct::device_to_device'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          258253
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          258879
-    Length:          15
-    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          258918
-    Length:          6
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          258924
-    Length:          8
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          258933
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          258978
-    Length:          17
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::async_dpct_memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          259050
-    Length:          0
-    ReplacementText: '*'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          259057
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          259323
-    Length:          11
-    ReplacementText: 'dpct::err0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          259339
-    Length:          17
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::async_dpct_memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          259395
-    Length:          0
-    ReplacementText: '*'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          259402
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          259404
-    Length:          0
-    ReplacementText: "            /*\n            DPCT1001:85: The statement could not be removed.\n            */\n            /*\n            DPCT1000:86: Error handling if-stmt was detected but could not be rewritten.\n            */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          259425
-    Length:          11
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          259473
-    Length:          11
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          259493
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          259674
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          260139
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          261515
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          261845
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          261872
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          262015
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          262476
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          262832
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          263151
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          263965
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          264284
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          264604
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          265034
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          265470
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          265906
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          266336
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          266772
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          267310
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          267739
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          268320
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          268937
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          269563
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          270147
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          270721
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          271448
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          271471
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          271702
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          274068
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          276458
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          278619
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          281822
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          281845
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          282160
-    Length:          18
-    ReplacementText: 'DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          282780
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          283141
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          283196
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          283260
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          283624
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          283679
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          283743
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          283794
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          283831
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          283875
-    Length:          45
-    ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          283957
-    Length:          396
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::gemm(*g_cublas_handles[id], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00, src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, dst_f16.get(), dpct::library_data_t::real_half, ldc, dpct::library_data_t::real_half))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          285106
-    Length:          45
-    ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[id] = stream)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          285188
-    Length:          244
-    ReplacementText: 'DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(*g_cublas_handles[id], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00, src1_ddf_i, ne10, dpct::get_value(&beta, *g_cublas_handles[id]), dst_dd_i, ldc))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          285515
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          285695
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          287917
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          287934
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          288500
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          288517
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          288959
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          290051
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          291125
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          291444
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          291957
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          292569
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          293193
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          293941
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          294213
-    Length:          0
-    ReplacementText: "    /*\n    DPCT1010:87: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n    */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          294228
-    Length:          18
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          294485
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          294834
-    Length:          0
-    ReplacementText: "    /*\n    DPCT1010:88: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n    */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          294849
-    Length:          18
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          295064
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          296196
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          297085
-    Length:          0
-    ReplacementText: "    /*\n    DPCT1010:89: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n    */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          297100
-    Length:          18
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          297204
-    Length:          15
-    ReplacementText: 'DPCT_CHECK_ERROR(main_stream->memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          297256
-    Length:          24
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          297280
-    Length:          13
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          297294
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          297367
-    Length:          23
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          297400
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          298826
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          302402
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          303369
-    Length:          0
-    ReplacementText: "                /*\n                DPCT1010:90: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n                */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          303396
-    Length:          18
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          304041
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1024:91: The original code returned the error code that was further consumed by the program logic. This original code was replaced with 0. You may need to rewrite the program logic consuming the error code.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          304060
-    Length:          86
-    ReplacementText: 'DPCT_CHECK_ERROR(*src0_extra->events[g_main_device][0] = g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          305030
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          305215
-    Length:          68
-    ReplacementText: 'DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier({*src0_extra->events[g_main_device][0]}))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          306734
-    Length:          15
-    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          306828
-    Length:          78
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          306906
-    Length:          8
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          306915
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          307176
-    Length:          15
-    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          307252
-    Length:          78
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          307330
-    Length:          8
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          307339
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          307967
-    Length:          0
-    ReplacementText: "                    /*\n                    DPCT1010:92: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n                    */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          307998
-    Length:          18
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          308505
-    Length:          0
-    ReplacementText: "                /*\n                DPCT1010:93: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n                */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          308532
-    Length:          18
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          308720
-    Length:          14
-    ReplacementText: 'dpct::memcpy_direction'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          308884
-    Length:          22
-    ReplacementText: 'dpct::device_to_host'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          309086
-    Length:          24
-    ReplacementText: 'dpct::device_to_device'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          310027
-    Length:          17
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::async_dpct_memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          310203
-    Length:          0
-    ReplacementText: '*'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          310210
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          310501
-    Length:          15
-    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          310566
-    Length:          6
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          310572
-    Length:          8
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          310581
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          310777
-    Length:          0
-    ReplacementText: "                    /*\n                    DPCT1024:94: The original code returned the error code that was further consumed by the program logic. This original code was replaced with 0. You may need to rewrite the program logic consuming the error code.\n                    */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          310808
-    Length:          51
-    ReplacementText: 'DPCT_CHECK_ERROR(*src0_extra->events[id][is] = stream->ext_oneapi_submit_barrier())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          312109
-    Length:          83
-    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier({*src0_extra->events[id][is]}))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          312346
-    Length:          23
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          312379
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          316249
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          316870
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          317481
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          317590
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          318161
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          318716
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          318774
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          318919
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          318928
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          318982
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          319008
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          319288
-    Length:          0
-    ReplacementText: ', const sycl::nd_item<3> &item_ct1'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          319310
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          319323
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          319336
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(2)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          319367
-    Length:          10
-    ReplacementText: 'item_ct1.get_group(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          319380
-    Length:          10
-    ReplacementText: 'item_ct1.get_local_range(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          319393
-    Length:          11
-    ReplacementText: 'item_ct1.get_local_id(1)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          319926
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          320948
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          321026
-    Length:          61
-    ReplacementText: 'DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          321237
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          321259
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          321748
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          321858
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          321896
-    Length:          19
-    ReplacementText: 'dpct::library_data_t'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          321958
-    Length:          14
-    ReplacementText: 'dpct::library_data_t'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          321996
-    Length:          10
-    ReplacementText: 'dpct::library_data_t::real_half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          322099
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          322133
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          322435
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          322481
-    Length:          4
-    ReplacementText: 'sycl::half'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          322608
-    Length:          10
-    ReplacementText: 'dpct::library_data_t::real_float'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          324031
-    Length:          613
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::gemm_batch(*g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const char *) src0_as_f16, dpct::library_data_t::real_half, nb01/sizeof(sycl::half), src0->nb[2]/sizeof(sycl::half), (const char *) src1_as_f16.get(), dpct::library_data_t::real_half, nb11/sizeof(float), src1->nb[2]/sizeof(float), beta, (      char *)       dst_t, cu_data_type, ne01, dst->nb[2]/sizeof(float), ne12*ne13, cu_compute_type))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          324853
-    Length:          4
-    ReplacementText: 'sycl::range<3>'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          324869
-    Length:          10
-    ReplacementText: 1, ne12, ne13
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          324882
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          324890
-    Length:          319
-    ReplacementText: "{\n          dpct::has_capability_or_fail(main_stream->get_device(), {sycl::aspect::fp16});\n\n          main_stream->submit(\n            [&](sycl::handler &cgh) {\n              const sycl::half * src1_as_f16_get_ct1 = src1_as_f16.get();\n              const void ** ptrs_src_get_ct3 = ptrs_src.get();\n              void ** ptrs_dst_get_ct4 = ptrs_dst.get();\n\n              cgh.parallel_for(\n                sycl::nd_range<3>(block_dims, block_dims), \n                [=](sycl::nd_item<3> item_ct1) {\n                  k_compute_batched_ptrs(src0_as_f16, src1_as_f16_get_ct1, dst_t, ptrs_src_get_ct3, ptrs_dst_get_ct4, ne12, ne13, ne23, nb02, nb03, nb12, nb13, nbd2, nbd3, r2, r3, item_ct1);\n                });\n            });\n        }"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: true
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          325209
-    Length:          1
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          325211
-    Length:          0
-    ReplacementText: "        /*\n        DPCT1010:95: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to rewrite this code.\n        */\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          325230
-    Length:          18
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          325282
-    Length:          499
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::gemm_batch(*g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const void **) (ptrs_src.get() + 0*ne23), dpct::library_data_t::real_half, nb01/sizeof(sycl::half), (const void **) (ptrs_src.get() + 1*ne23), dpct::library_data_t::real_half, nb11/sizeof(float), beta, (      void **) (ptrs_dst.get() + 0*ne23), cu_data_type, ne01, ne23, cu_compute_type))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          326000
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          336638
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          337017
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          337254
-    Length:          15
-    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          337312
-    Length:          24
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          337336
-    Length:          8
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          337345
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          337367
-    Length:          29
-    ReplacementText: 'DPCT_CHECK_ERROR(stream->wait())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          339747
-    Length:          14
-    ReplacementText: 'dpct::memcpy_direction'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          339822
-    Length:          22
-    ReplacementText: 'dpct::host_to_device'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          339847
-    Length:          24
-    ReplacementText: 'dpct::device_to_device'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          339887
-    Length:          14
-    ReplacementText: 'dpct::memcpy_direction'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          339962
-    Length:          22
-    ReplacementText: 'dpct::device_to_host'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          339987
-    Length:          24
-    ReplacementText: 'dpct::device_to_device'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          340538
-    Length:          15
-    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          340668
-    Length:          11
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          340679
-    Length:          8
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          340688
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          341614
-    Length:          15
-    ReplacementText: 'DPCT_CHECK_ERROR(stream->memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          341739
-    Length:          10
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          341749
-    Length:          8
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          341758
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          341888
-    Length:          29
-    ReplacementText: 'DPCT_CHECK_ERROR(stream->wait())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          341927
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          342360
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          343164
-    Length:          12
-    ReplacementText: 'dpct::queue_ptr'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          344994
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          347078
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          348753
-    Length:          22
-    ReplacementText: 'DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(size, dpct::get_in_order_queue()))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          348950
-    Length:          10
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memset'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          349006
-    Length:          0
-    ReplacementText: '.wait())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          349039
-    Length:          10
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          349078
-    Length:          24
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          349103
-    Length:          0
-    ReplacementText: '.wait())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          349281
-    Length:          72
-    ReplacementText: 'DPCT_CHECK_ERROR(extra->events[id][is] = new sycl::event())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          349415
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          349471
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          349874
-    Length:          32
-    ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(extra->data_device[id], dpct::get_in_order_queue())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          349906
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          350108
-    Length:          39
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::destroy_event(extra->events[id][is]))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          350200
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          350900
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          353118
-    Length:          33
-    ReplacementText: 'DPCT_CHECK_ERROR(data = (char *)sycl::malloc_device(g_scratch_size, dpct::get_in_order_queue()))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          353512
-    Length:          23
-    ReplacementText: 'DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(size, dpct::get_in_order_queue()))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          353557
-    Length:          10
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memset'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          353582
-    Length:          0
-    ReplacementText: '.wait())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          353755
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          353838
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          353996
-    Length:          45
-    ReplacementText: 'DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(g_scratch_size, dpct::get_in_order_queue()))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          354826
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          354887
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          355134
-    Length:          10
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          355213
-    Length:          24
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          355238
-    Length:          0
-    ReplacementText: '.wait())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          355242
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          355840
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          356198
-    Length:          14
-    ReplacementText: 'dpct::device_info'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          356238
-    Length:          23
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_device_info'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          356262
-    Length:          5
-    ReplacementText: prop
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          356269
-    Length:          13
-    ReplacementText: 'dpct::dev_mgr::instance().get_device(g_main_device)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          356283
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          356385
-    Length:          4
-    ReplacementText: 'get_name()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          356399
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          356823
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          356902
-    Length:          26
-    ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(g_scratch_buffer, dpct::get_in_order_queue())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          356928
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          356964
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362011
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362043
-    Length:          33
-    ReplacementText: 'DPCT_CHECK_ERROR(device_count = dpct::dev_mgr::instance().device_count())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362080
-    Length:          11
-    ReplacementText: '0'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362145
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362242
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362248
-    Length:          14
-    ReplacementText: 'dpct::device_info'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362284
-    Length:          23
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_device_info'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362308
-    Length:          5
-    ReplacementText: prop
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362315
-    Length:          6
-    ReplacementText: 'dpct::dev_mgr::instance().get_device(device)'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362322
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362380
-    Length:          4
-    ReplacementText: 'get_name()'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          362388
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          363487
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          363602
-    Length:          22
-    ReplacementText: 'DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          363624
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          363644
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          363953
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          364993
-    Length:          15
-    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          365077
-    Length:          31
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          365109
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          365150
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          365299
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          365509
-    Length:          23
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          365551
-    Length:          10
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          365603
-    Length:          24
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          365628
-    Length:          0
-    ReplacementText: '.wait())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          365632
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          365781
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          365991
-    Length:          23
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          366033
-    Length:          10
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          366091
-    Length:          24
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          366116
-    Length:          0
-    ReplacementText: '.wait())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          366120
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          366210
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          366365
-    Length:          23
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          366407
-    Length:          10
-    ReplacementText: 'DPCT_CHECK_ERROR(dpct::get_in_order_queue().memset'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          366452
-    Length:          0
-    ReplacementText: '.wait())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          366456
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          367132
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          367331
-    Length:          26
-    ReplacementText: 'DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(size, dpct::get_in_order_queue()))'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          367548
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          371821
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372103
-    Length:          15
-    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372160
-    Length:          24
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372184
-    Length:          36
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372221
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372225
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372367
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372649
-    Length:          15
-    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372712
-    Length:          24
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372736
-    Length:          36
-    ReplacementText: ''
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372773
-    Length:          0
-    ReplacementText: ')'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372777
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372845
-    Length:          0
-    ReplacementText: ' try '
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          372953
-    Length:          57
-    ReplacementText: 'DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait())'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Offset:          373036
-    Length:          0
-    ReplacementText: "\ncatch (sycl::exception const &exc) {\n  std::cerr << exc.what() << \"Exception caught at file:\" << __FILE__ << \", line:\" << __LINE__ << std::endl;\n  std::exit(1);\n}"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-MainSourceFilesDigest:
-  - MainSourceFile:  '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml-cuda.cu'
-    Digest:          fe16d2da27d2d01e9e6dcb75ef2d0692
-DpctVersion:     18.0.0
-MainHelperFileName: ''
-USMLevel:        ''
-FeatureMap:      {}
-CompileTargets:  {}
-OptionMap:
-  AnalysisScopePath:
-    Value:           '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub'
-    Specified:       false
-  AsyncHandler:
-    Value:           'false'
-    Specified:       false
-  CommentsEnabled:
-    Value:           'false'
-    Specified:       false
-  CompilationsDir:
-    Value:           ''
-    Specified:       false
-  CtadEnabled:
-    Value:           'false'
-    Specified:       false
-  EnablepProfiling:
-    Value:           'true'
-    Specified:       true
-  ExperimentalFlag:
-    Value:           '0'
-    Specified:       false
-  ExplicitClNamespace:
-    Value:           'false'
-    Specified:       false
-  ExplicitNamespace:
-    Value:           '20'
-    Specified:       false
-  ExtensionDDFlag:
-    Value:           '0'
-    Specified:       false
-  ExtensionDEFlag:
-    Value:           '4294967295'
-    Specified:       false
-  HelperFuncPreferenceFlag:
-    Value:           '0'
-    Specified:       false
-  NDRangeDim:
-    Value:           '3'
-    Specified:       false
-  NoDRYPattern:
-    Value:           'false'
-    Specified:       false
-  NoUseGenericSpace:
-    Value:           ''
-    Specified:       true
-  OptimizeMigration:
-    Value:           'false'
-    Specified:       false
-  ProcessAll:
-    Value:           'false'
-    Specified:       false
-  RuleFile:
-    Value:           ''
-    Specified:       false
-  SyclNamedLambda:
-    Value:           'false'
-    Specified:       false
-  UsmLevel:
-    Value:           '1'
-    Specified:       false
-...
diff --git a/dpcpp_out2/ggml-alloc.h b/dpcpp_out2/ggml-alloc.h
deleted file mode 100644
index 64a412468915b..0000000000000
--- a/dpcpp_out2/ggml-alloc.h
+++ /dev/null
@@ -1,92 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-struct ggml_backend;
-struct ggml_backend_buffer;
-struct ggml_backend_buffer_type;
-
-//
-// Legacy API
-//
-
-typedef struct ggml_allocr * ggml_allocr_t;
-
-// initialize allocator for use with CPU backend only
-GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
-
-// initialize allocator for use with ggml-backend
-GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
-
-// tell the allocator to parse nodes following the order described in the list
-// you should call this if your graph are optimized to execute out-of-order
-GGML_API void   ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
-
-GGML_API void   ggml_allocr_free       (ggml_allocr_t alloc);
-GGML_API bool   ggml_allocr_is_measure (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_reset      (ggml_allocr_t alloc);
-GGML_API void   ggml_allocr_alloc      (ggml_allocr_t alloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_allocr_max_size   (ggml_allocr_t alloc);
-
-GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
-
-//
-// ggml-backend v2 API
-//
-
-// Separate tensor and graph allocator objects
-// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
-// The original API is kept as a wrapper around the new API
-
-// Tensor allocator
-typedef struct ggml_tallocr * ggml_tallocr_t;
-
-GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
-
-GGML_API void   ggml_tallocr_free       (ggml_tallocr_t talloc);
-GGML_API bool   ggml_tallocr_is_measure (ggml_tallocr_t talloc);
-GGML_API void   ggml_tallocr_reset      (ggml_tallocr_t talloc);
-GGML_API void   ggml_tallocr_alloc      (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_tallocr_max_size   (ggml_tallocr_t talloc);
-
-
-// Graph allocator
-typedef struct ggml_gallocr * ggml_gallocr_t;
-
-GGML_API ggml_gallocr_t ggml_gallocr_new(void);
-GGML_API void   ggml_gallocr_free(ggml_gallocr_t galloc);
-
-GGML_API void   ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
-GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
-
-// Allocate tensors from the allocators given by the hash table
-GGML_API void   ggml_gallocr_alloc_graph_n(
-                    ggml_gallocr_t galloc,
-                    struct ggml_cgraph * graph,
-                    struct ggml_hash_set hash_set,
-                    ggml_tallocr_t * hash_node_talloc);
-
-
-// Utils
-// Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/dpcpp_out2/ggml-backend-impl.h b/dpcpp_out2/ggml-backend-impl.h
deleted file mode 100644
index 05859935a3c2f..0000000000000
--- a/dpcpp_out2/ggml-backend-impl.h
+++ /dev/null
@@ -1,116 +0,0 @@
-#pragma once
-
-// ggml-backend internal header
-
-#include "ggml-backend.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    typedef void * ggml_backend_buffer_type_context_t;
-
-    struct ggml_backend_buffer_type_i {
-        ggml_backend_buffer_t (*alloc_buffer)    (ggml_backend_buffer_type_t buft, size_t size);
-        size_t                (*get_alignment)   (ggml_backend_buffer_type_t buft); // tensor alignment
-        size_t                (*get_alloc_size)  (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
-        bool                  (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
-        // check if tensor data is in host memory
-        // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
-        bool                  (*is_host)         (ggml_backend_buffer_type_t buft);
-    };
-
-    struct ggml_backend_buffer_type {
-        struct ggml_backend_buffer_type_i  iface;
-        ggml_backend_buffer_type_context_t context;
-    };
-
-    // buffer
-    typedef void * ggml_backend_buffer_context_t;
-
-    struct ggml_backend_buffer_i {
-        void   (*free_buffer)    (ggml_backend_buffer_t buffer);
-        //void     (*reset)      (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
-        void * (*get_base)       (ggml_backend_buffer_t buffer);
-        void   (*init_tensor)    (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-        void   (*set_tensor)     (ggml_backend_buffer_t buffer,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void   (*get_tensor)     (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-        // (optional) copy tensor between different buffer-type, allow for single-copy tranfers
-        void   (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void   (*cpy_tensor_to)  (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void   (*clear)          (ggml_backend_buffer_t buffer, uint8_t value);
-    };
-
-    struct ggml_backend_buffer {
-        struct ggml_backend_buffer_i  iface;
-        ggml_backend_buffer_type_t    buft;
-        ggml_backend_buffer_context_t context;
-        size_t size;
-    };
-
-    ggml_backend_buffer_t ggml_backend_buffer_init(
-                   ggml_backend_buffer_type_t      buft,
-            struct ggml_backend_buffer_i           iface,
-                   ggml_backend_buffer_context_t   context,
-                   size_t                          size);
-
-
-    //
-    // Backend
-    //
-
-    typedef void * ggml_backend_context_t;
-
-    struct ggml_backend_i {
-        const char * (*get_name)(ggml_backend_t backend);
-
-        void (*free)(ggml_backend_t backend);
-
-        // buffer allocation
-        ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
-
-        // (optional) asynchroneous tensor data access
-        void (*set_tensor_async)(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-        void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-        // (optional) asynchroneous tensor copy
-        void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-        void (*cpy_tensor_to_async)  (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
-
-        void (*synchronize)(ggml_backend_t backend);
-
-        // compute graph with a plan
-        ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-        void                      (*graph_plan_free)   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-        void                      (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-
-        // compute graph without a plan
-        void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-        // check if the backend supports an operation
-        bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
-    };
-
-    struct ggml_backend {
-        struct ggml_backend_i iface;
-
-        ggml_backend_context_t context;
-    };
-
-
-    //
-    // Backend registry
-    //
-
-    typedef ggml_backend_t (*ggml_backend_init_fn)(const char * params, void * user_data);
-
-    void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/dpcpp_out2/ggml-backend.h b/dpcpp_out2/ggml-backend.h
deleted file mode 100644
index a9d2fddd726a8..0000000000000
--- a/dpcpp_out2/ggml-backend.h
+++ /dev/null
@@ -1,188 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-alloc.h"
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-    typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
-    typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
-    typedef struct ggml_backend * ggml_backend_t;
-    typedef void * ggml_backend_graph_plan_t;
-
-    //
-    // Backend buffer
-    //
-
-    // buffer type
-    GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size);
-    GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
-    GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
-    GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
-    GGML_API bool ggml_backend_buft_is_host         (ggml_backend_buffer_type_t buft);
-
-    // buffer
-    GGML_API void   ggml_backend_buffer_free          (ggml_backend_buffer_t buffer);
-    GGML_API void * ggml_backend_buffer_get_base      (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_size      (ggml_backend_buffer_t buffer);
-    GGML_API void   ggml_backend_buffer_init_tensor   (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
-    GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-    GGML_API void   ggml_backend_buffer_clear         (ggml_backend_buffer_t buffer, uint8_t value);
-    GGML_API bool   ggml_backend_buffer_is_host       (ggml_backend_buffer_t buffer);
-    GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
-
-    //
-    // Backend
-    //
-
-
-    GGML_API const char * ggml_backend_name(ggml_backend_t backend);
-    GGML_API void         ggml_backend_free(ggml_backend_t backend);
-
-    GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t      ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
-    GGML_API size_t                     ggml_backend_get_alignment(ggml_backend_t backend);
-
-    GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend,       struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    GGML_API void ggml_backend_tensor_set(      struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
-    GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor,       void * data, size_t offset, size_t size);
-
-    GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
-
-    GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-    GGML_API void ggml_backend_graph_plan_free   (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    GGML_API void ggml_backend_graph_compute     (ggml_backend_t backend, struct ggml_cgraph * cgraph);
-    GGML_API bool ggml_backend_supports_op       (ggml_backend_t backend, const struct ggml_tensor * op);
-
-    // tensor copy between different backends
-    GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
-    GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); // automatic fallback to sync copy
-
-    //
-    // CPU backend
-    //
-
-    GGML_API ggml_backend_t ggml_backend_cpu_init(void);
-
-    GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
-    GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
-
-    // Create a backend buffer from an existing pointer
-    GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
-
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
-
-#ifdef GGML_USE_CPU_HBM
-    GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
-#endif
-
-    //
-    // Backend registry
-    //
-
-    // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
-
-    GGML_API size_t                     ggml_backend_reg_get_count(void);
-    GGML_API size_t                     ggml_backend_reg_find_by_name(const char * name);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
-    GGML_API const char *               ggml_backend_reg_get_name(size_t i);
-    GGML_API ggml_backend_t             ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
-    GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
-    GGML_API ggml_backend_buffer_t      ggml_backend_reg_alloc_buffer(size_t i, size_t size);
-
-    //
-    // Backend scheduler
-    //
-
-    // The backend scheduler allows for multiple backends to be used together
-    // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
-    // The backends are selected based on:
-    // - the backend that supports the operation
-    // - the location of the pre-allocated tensors (e.g. the weights)
-    /*
-      Example usage:
-
-        sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, num_backends);
-        // sched is initialized with measure allocators and cannot be used until allocated with a measure graph
-
-        // initialize buffers from a measure graph
-        measure_graph = build_graph(sched); // use the allocr to allocate inputs as needed
-
-        // in build_graph:
-        build_graph(...) {
-            // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
-            alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
-            ggml_allocr_alloc(alloc_cpu, tensor);
-
-            // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
-            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
-            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
-        }
-
-        // allocate backend buffers from measure graph
-        ggml_backend_sched_init_measure(sched, measure_graph);
-
-        // the scheduler is now ready to compute graphs
-
-        // compute
-        graph = build_graph(sched);
-        ggml_backend_sched_graph_compute(sched, graph);
-    */
-
-    struct ggml_backend_sched;
-    typedef struct ggml_backend_sched * ggml_backend_sched_t;
-
-    // Initialize a backend scheduler
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends);
-
-    GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
-
-    // Initialize backend buffers from a measure graph
-    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
-
-    GGML_API ggml_tallocr_t        ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
-
-    GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
-
-    // Allocate a graph on the backend scheduler
-    GGML_API void ggml_backend_sched_graph_compute(
-            ggml_backend_sched_t sched,
-            struct ggml_cgraph * graph);
-
-
-    //
-    // Utils
-    //
-
-    struct ggml_backend_graph_copy {
-        ggml_backend_buffer_t buffer;
-        struct ggml_context * ctx_allocated;
-        struct ggml_context * ctx_unallocated;
-        struct ggml_cgraph * graph;
-    };
-
-    // Copy a graph to a different backend
-    GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
-    GGML_API void                           ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
-
-    typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
-
-    // Compare the output of two backends
-    GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
-
-    // Tensor initialization
-    GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
-    GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
-
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/dpcpp_out2/ggml-cuda.dp.cpp b/dpcpp_out2/ggml-cuda.dp.cpp
deleted file mode 100644
index fc6c68cdcef01..0000000000000
--- a/dpcpp_out2/ggml-cuda.dp.cpp
+++ /dev/null
@@ -1,12724 +0,0 @@
-#define DPCT_PROFILING_ENABLED
-#define DPCT_COMPAT_RT_VERSION 12010
-#include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
-#include <algorithm>
-#include <assert.h>
-#include <atomic>
-#include <cinttypes>
-#include <cstddef>
-#include <cstdint>
-#include <float.h>
-#include <limits>
-#include <stdint.h>
-#include <stdio.h>
-#include <vector>
-#include <dpct/blas_utils.hpp>
-
-#if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
-#include <hipblas/hipblas.h>
-#include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F  HIPBLAS_R_16F
-#define CUDA_R_32F  HIPBLAS_R_32F
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
-#define cublasCreate hipblasCreate
-#define cublasGemmEx hipblasGemmEx
-#define cublasGemmBatchedEx hipblasGemmBatchedEx
-#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDataType_t hipblasDatatype_t //deprecated, new hipblasDatatype not in 5.6
-#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceProp hipDeviceProp_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEvent_t hipEvent_t
-#define cudaEventDestroy hipEventDestroy
-#define cudaFree hipFree
-#define cudaFreeHost hipHostFree
-#define cudaGetDevice hipGetDevice
-#define cudaGetDeviceCount hipGetDeviceCount
-#define cudaGetDeviceProperties hipGetDeviceProperties
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#ifdef GGML_HIP_UMA
-#define cudaMalloc hipMallocManaged
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size)
-#else
-#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#endif
-#define cudaMemcpy hipMemcpy
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaMemcpyKind hipMemcpyKind
-#define cudaMemset hipMemset
-#define cudaMemsetAsync hipMemsetAsync
-#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
-#define cudaSetDevice hipSetDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamFireAndForget hipStreamFireAndForget
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#define __trap abort
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_STATUS_NOT_INITIALIZED HIPBLAS_STATUS_NOT_INITIALIZED
-#define CUBLAS_STATUS_ALLOC_FAILED HIPBLAS_STATUS_ALLOC_FAILED
-#define CUBLAS_STATUS_INVALID_VALUE HIPBLAS_STATUS_INVALID_VALUE
-#define CUBLAS_STATUS_ARCH_MISMATCH HIPBLAS_STATUS_ARCH_MISMATCH
-#define CUBLAS_STATUS_MAPPING_ERROR HIPBLAS_STATUS_MAPPING_ERROR
-#define CUBLAS_STATUS_EXECUTION_FAILED HIPBLAS_STATUS_EXECUTION_FAILED
-#define CUBLAS_STATUS_INTERNAL_ERROR HIPBLAS_STATUS_INTERNAL_ERROR
-#define CUBLAS_STATUS_NOT_SUPPORTED HIPBLAS_STATUS_NOT_SUPPORTED
-#else
-
-#if DPCT_COMPAT_RT_VERSION < 11020
-#define CUBLAS_TF32_TENSOR_OP_MATH CUBLAS_TENSOR_OP_MATH
-#define CUBLAS_COMPUTE_16F CUDA_R_16F
-#define CUBLAS_COMPUTE_32F CUDA_R_32F
-#define cublasComputeType_t cudaDataType_t
-#endif // CUDART_VERSION < 11020
-
-#endif // defined(GGML_USE_HIPBLAS)
-
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-#include <cmath>
-
-#include <dpct/lib_common_utils.hpp>
-
-#define MIN_CC_DP4A   510 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA      700
-#define CC_OFFSET_AMD 1000000
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
-
-#define GGML_CUDA_MAX_NODES 8192
-
-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// -  7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-//       probably other such cases, and not sure what happens on AMD hardware
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-// max batch size to use MMQ kernels when tensor cores are available
-#define MMQ_MAX_BATCH_SIZE 32
-
-#if defined(GGML_USE_HIPBLAS)
-#define __CUDA_ARCH__ 1300
-
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
-#define RDNA3
-#endif
-
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
-#define RDNA2
-#endif
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int&>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int&>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(__gfx1100__)
-    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
-    int tmp1;
-    int tmp2;
-    asm("\n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
-    return c;
-}
-#endif // defined(GGML_USE_HIPBLAS)
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
-
-#if DPCT_COMPAT_RT_VERSION >= 12000
-    static const char *cublas_get_error_str(const int err) {
-        /*
-        DPCT1009:48: SYCL uses exceptions to report errors and does not use the
-        error codes. The original code was commented out and a warning string
-        was inserted. You need to rewrite this code.
-        */
-        return "cublasGetStatusString is not supported" /*cublasGetStatusString(err)*/
-            ;
-    }
-#else
-    static const char * cublas_get_error_str(const cublasStatus_t err) {
-        switch (err) {
-            case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS";
-            case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED";
-            case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED";
-            case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE";
-            case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH";
-            case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR";
-            case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED";
-            case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR";
-            case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED";
-            default: return "unknown error";
-        }
-    }
-#endif // CUDART_VERSION >= 12000
-
-[[noreturn]]
-static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) {
-    fprintf(stderr, "CUDA error: %s: %s\n", stmt, msg);
-    fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
-    GGML_ASSERT(!"CUDA error");
-}
-
-/*
-DPCT1001:50: The statement could not be removed.
-*/
-/*
-DPCT1000:51: Error handling if-stmt was detected but could not be rewritten.
-*/
-/*
-DPCT1009:52: SYCL uses exceptions to report errors and does not use the error
-codes. The original code was commented out and a warning string was inserted.
-You need to rewrite this code.
-*/
-#define CUDA_CHECK(err) do {                                                   \
-    auto err_ = (err); if (err_ != 0) ggml_cuda_error(                         \
-        #err, __func__, __FILE__, __LINE__,                                    \
-        "cudaGetErrorString is not supported" /*cudaGetErrorString(err_)*/);   \
-} while (0)
-#define CUBLAS_CHECK(err)                                                      \
-    do { auto err_ = (err); if (err_ != 0)                                     \
-             ggml_cuda_error(#err, __func__, __FILE__, __LINE__,               \
-                             cublas_get_error_str(err_)); } while (0)
-
-#if !defined(GGML_USE_HIPBLAS)
-static const char *cu_get_error_str(int err) {
-    const char * err_str;
-    /*
-    DPCT1007:49: Migration of cuGetErrorString is not supported.
-    */
-    cuGetErrorString(err, &err_str);
-    return err_str;
-}
-/*
-DPCT1001:67: The statement could not be removed.
-*/
-/*
-DPCT1000:68: Error handling if-stmt was detected but could not be rewritten.
-*/
-#define CU_CHECK(err)                                                          \
-    do { auto err_ = (err);                                                    \
-         if (err_ != 0) ggml_cuda_error(#err, __func__, __FILE__, __LINE__,    \
-                                        cu_get_error_str(err_)); } while (0)
-#endif
-
-#if DPCT_COMPAT_RT_VERSION >= 11100
-#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
-#else
-#define GGML_CUDA_ASSUME(x)
-#endif // CUDART_VERSION >= 11100
-
-#ifdef GGML_CUDA_F16
-typedef half dfloat; // dequantize float
-typedef half2 dfloat2;
-#else
-typedef float dfloat; // dequantize float
-typedef sycl::float2 dfloat2;
-#endif //GGML_CUDA_F16
-
-static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __dpct_inline__ int get_int_from_uint8(const uint8_t *x8,
-                                              const int &i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __dpct_inline__ int get_int_from_int8_aligned(const int8_t *x8,
-                                                     const int &i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-static __dpct_inline__ int get_int_from_uint8_aligned(const uint8_t *x8,
-                                                      const int &i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-template <typename T>
-using to_t_cuda_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
-                             int k, dpct::queue_ptr stream);
-typedef to_t_cuda_t<float> to_fp32_cuda_t;
-typedef to_t_cuda_t<sycl::half> to_fp16_cuda_t;
-
-typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
-typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
-typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*ggml_cuda_op_mul_mat_t)(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream);
-typedef void (*ggml_cuda_op_flatten_t)(const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst, const float *src0_dd,
-                                       const float *src1_dd, float *dst_dd,
-                                       const dpct::queue_ptr &main_stream);
-
-// QK = number of values after dequantization
-// QR = QK / number of values before dequantization
-// QI = number of 32 bit integers before dequantization
-
-#define QK4_0 32
-#define QR4_0 2
-#define QI4_0 (QK4_0 / (4 * QR4_0))
-typedef struct dpct_type_471834 {
-    sycl::half d;           // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-#define QR4_1 2
-#define QI4_1 (QK4_1 / (4 * QR4_1))
-typedef struct dpct_type_143705 {
-    sycl::half2 dm;         // dm.x = delta, dm.y = min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-#define QR5_0 2
-#define QI5_0 (QK5_0 / (4 * QR5_0))
-typedef struct dpct_type_673649 {
-    sycl::half d;           // delta
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-#define QR5_1 2
-#define QI5_1 (QK5_1 / (4 * QR5_1))
-typedef struct dpct_type_135589 {
-    sycl::half2 dm;         // dm.x = delta, dm.y = min
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2];  // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-#define QR8_0 1
-#define QI8_0 (QK8_0 / (4 * QR8_0))
-typedef struct dpct_type_122878 {
-    sycl::half d;           // delta
-    int8_t  qs[QK8_0];      // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-#define QR8_1 1
-#define QI8_1 (QK8_1 / (4 * QR8_1))
-typedef struct dpct_type_143721 {
-    sycl::half2 ds;         // ds.x = delta, ds.y = sum
-    int8_t  qs[QK8_0];      // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
-
-typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
-typedef void (*allocate_tiles_cuda_t)(int **x_ql, sycl::half2 **x_dm,
-                                      int **x_qh, int **x_sc);
-typedef void (*load_tiles_cuda_t)(const void *__restrict__ vx,
-                                  int *__restrict__ x_ql,
-                                  sycl::half2 *__restrict__ x_dm,
-                                  int *__restrict__ x_qh,
-                                  int *__restrict__ x_sc, const int &i_offset,
-                                  const int &i_max, const int &k,
-                                  const int &blocks_per_row);
-typedef float (*vec_dot_q_mul_mat_cuda_t)(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
-    const int &i, const int &j, const int &k);
-
-//================================= k-quants
-
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
-#define QK_K 256
-#define K_SCALE_SIZE 12
-#endif
-
-#define QR2_K 4
-#define QI2_K (QK_K / (4*QR2_K))
-typedef struct dpct_type_619598 {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    sycl::half2 dm;          // super-block scale for quantized scales/mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
-
-#define QR3_K 4
-#define QI3_K (QK_K / (4*QR3_K))
-typedef struct dpct_type_138576 {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#ifdef GGML_QKK_64
-    uint8_t scales[2]; // scales, quantized with 8 bits
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    sycl::half d; // super-block scale
-} block_q3_K;
-//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
-
-#define QR4_K 2
-#define QI4_K (QK_K / (4*QR4_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half    dm[2];             // super-block scales/mins
-    uint8_t scales[2];         // 4-bit block scales/mins
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
-typedef struct dpct_type_154943 {
-    sycl::half2 dm;            // super-block scale for quantized scales/mins
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
-#endif
-
-#define QR5_K 2
-#define QI5_K (QK_K / (4*QR5_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half d;                  // super-block scale
-    int8_t scales[QK_K/16];  // block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
-typedef struct dpct_type_866817 {
-    sycl::half2 dm;               // super-block scale for quantized scales/mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
-
-#define QR6_K 2
-#define QI6_K (QK_K / (4*QR6_K))
-typedef struct dpct_type_107281 {
-    uint8_t ql[QK_K/2];   // quants, lower 4 bits
-    uint8_t qh[QK_K/4];   // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales
-    sycl::half d;            // delta
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
-
-#define WARP_SIZE 32
-#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
-
-#define CUDA_GELU_BLOCK_SIZE 256
-#define CUDA_SILU_BLOCK_SIZE 256
-#define CUDA_TANH_BLOCK_SIZE 256
-#define CUDA_RELU_BLOCK_SIZE 256
-#define CUDA_SQR_BLOCK_SIZE 256
-#define CUDA_CPY_BLOCK_SIZE 32
-#define CUDA_SCALE_BLOCK_SIZE 256
-#define CUDA_CLAMP_BLOCK_SIZE 256
-#define CUDA_ROPE_BLOCK_SIZE 256
-#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
-#define CUDA_ALIBI_BLOCK_SIZE 32
-#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
-#define CUDA_QUANTIZE_BLOCK_SIZE 256
-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-#define CUDA_GET_ROWS_BLOCK_SIZE 256
-#define CUDA_UPSCALE_BLOCK_SIZE 256
-#define CUDA_CONCAT_BLOCK_SIZE 256
-#define CUDA_PAD_BLOCK_SIZE 256
-#define CUDA_ACC_BLOCK_SIZE 256
-#define CUDA_IM2COL_BLOCK_SIZE 256
-
-// dmmv = dequantize_mul_mat_vec
-#ifndef GGML_CUDA_DMMV_X
-#define GGML_CUDA_DMMV_X 32
-#endif
-#ifndef GGML_CUDA_MMV_Y
-#define GGML_CUDA_MMV_Y 1
-#endif
-
-#ifndef K_QUANTS_PER_ITERATION
-#define K_QUANTS_PER_ITERATION 2
-#else
-static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
-#endif
-
-#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
-#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
-#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
-
-#define MUL_MAT_SRC1_COL_STRIDE 128
-
-#define MAX_STREAMS 8
-static dpct::queue_ptr g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = {
-    {&dpct::get_in_order_queue()}};
-
-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    dpct::event_ptr
-        events[GGML_CUDA_MAX_DEVICES]
-              [MAX_STREAMS]; // events for synchronizing multiple GPUs
-};
-
-// this is faster on Windows
-// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
-inline dpct::err0 ggml_cuda_set_device(const int device) try {
-    int current_device;
-    CUDA_CHECK(DPCT_CHECK_ERROR(
-        current_device = dpct::dev_mgr::instance().current_device_id()));
-
-    if (device == current_device) {
-        return 0;
-    }
-
-    /*
-    DPCT1093:53: The "device" device may be not the one intended for use. Adjust
-    the selected device if needed.
-    */
-    return DPCT_CHECK_ERROR(dpct::select_device(device));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static int g_device_count = -1;
-static int g_main_device = 0;
-static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-
-struct cuda_device_capabilities {
-    int     cc;                 // compute capability
-    bool    vmm;                // virtual memory support
-    size_t  vmm_granularity;    // granularity of virtual memory
-};
-
-static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, false, 0} };
-
-
-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 0; // disabled by default
-static size_t g_scratch_offset = 0;
-
-static dpct::queue_ptr g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-[[noreturn]]
-static void bad_arch(const sycl::stream &stream_ct1) {
-    stream_ct1 << "ERROR: ggml-cuda was compiled without support for the "
-                  "current GPU architecture.\n";
-    __trap();
-
-    (void) bad_arch; // suppress unused function warning
-}
-
-static __dpct_inline__ float warp_reduce_sum(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1096:98: The right-most dimension of the work-group used in the SYCL
-        kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
-    }
-    return x;
-}
-
-static __dpct_inline__ sycl::float2
-warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
-                                                mask);
-        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
-                                                mask);
-    }
-    return a;
-}
-
-static __dpct_inline__ float warp_reduce_max(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1096:97: The right-most dimension of the work-group used in the SYCL
-        kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
-                              item_ct1.get_sub_group(), x, mask));
-    }
-    return x;
-}
-
-static __dpct_inline__ float op_repeat(const float a, const float b) {
-    return b;
-}
-
-static __dpct_inline__ float op_add(const float a, const float b) {
-    return a + b;
-}
-
-static __dpct_inline__ float op_mul(const float a, const float b) {
-    return a * b;
-}
-
-static __dpct_inline__ float op_div(const float a, const float b) {
-    return a / b;
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
-    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1));
-    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) /
-                   ne3;
-    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) %
-                   ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0;
-         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
-
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
-
-static void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= ne) {
-        return;
-    }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
-    }
-}
-
-static void gelu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    float xi = x[i];
-    dst[i] = 0.5f * xi *
-             (1.0f +
-              sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
-}
-
-static void silu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
-}
-
-static void gelu_quick_f32(const float *x, float *dst, int k,
-                           const sycl::nd_item<3> &item_ct1) {
-    const float GELU_QUICK_COEF = -1.702f;
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
-}
-
-static void tanh_f32(const float *x, float *dst, int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::tanh((float)(x[i]));
-}
-
-static void relu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::fmax((float)(x[i]), (float)0);
-}
-
-static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
-                           const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::fmax((float)(x[i]), (float)0) +
-             sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
-}
-
-static void sqr_f32(const float * x, float * dst, const int k,
-                    const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * x[i];
-}
-
-template <int block_size>
-static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-
-    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        mean_var.x() += xi;
-        mean_var.y() += xi * xi;
-    }
-
-    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        /*
-        DPCT1118:0: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        mean_var = s_sum[lane_id];
-        mean_var = warp_reduce_sum(mean_var, item_ct1);
-    }
-
-    const float mean = mean_var.x() / ncols;
-    const float var = mean_var.y() / ncols - mean * mean;
-    const float inv_std = sycl::rsqrt(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
-static void concat_f32(const float  *x,const float  *y, float *dst, const int ne0, const int ne02,
-                       const sycl::nd_item<3> &item_ct1) {
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (item_ct1.get_group(0) < ne02) { // src0
-        int offset_src =
-            nidx + item_ct1.get_group(1) * ne0 +
-            item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-            dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx + item_ct1.get_group(1) * ne0 +
-            (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
-            dst[offset_dst] = y[offset_src];
-    }
-}
-
-static void upscale_f32(const float  *x, float *dst, const int ne00, const int nb02, const int scale_factor,
-                        const sycl::nd_item<3> &item_ct1) {
-    int ne0 = ne00 * scale_factor;
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int i00 = nidx / scale_factor;
-    int i01 = item_ct1.get_group(1) / scale_factor;
-    int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    dst[offset_dst] = x[offset_src];
-}
-
-static void pad_f32(const float  *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
-                    const sycl::nd_item<3> &item_ct1) {
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-
-    // operation
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
-        item_ct1.get_group(0) < ne02) {
-        int offset_src = nidx + item_ct1.get_group(1) * ne00 +
-                         item_ct1.get_group(0) * ne00 * ne01;
-            dst[offset_dst] = x[offset_src];
-    } else {
-        dst[offset_dst] = 0.0f;
-    }
-}
-
-template <int block_size>
-static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
-                           const sycl::nd_item<3> &item_ct1, float *s_sum) {
-    int start = item_ct1.get_group(2) * group_size;
-    int end = start + group_size;
-
-    start += item_ct1.get_local_id(2);
-
-    if (end >= ne_elements) {
-        end = ne_elements;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += block_size) {
-        tmp += x[j];
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:1: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:54: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float mean = tmp / group_size;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += block_size) {
-        float xi = x[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:2: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:55: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float variance = tmp / group_size;
-    float scale = sycl::rsqrt(variance + eps);
-    for (int j = start; j < end; j += block_size) {
-        dst[j] *= scale;
-    }
-}
-
-template <int block_size>
-static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                         const sycl::nd_item<3> &item_ct1, float *s_sum) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:3: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    const float mean = tmp / ncols;
-    const float scale = sycl::rsqrt(mean + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = scale * x[row*ncols + col];
-    }
-}
-
-static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q4_0 * x = (const block_q4_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x() = vui & 0xF;
-    v.y() = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {8.0f, 8.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x() = (v.x() - 8.0f) * d;
-    v.y() = (v.y() - 8.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q4_1 * x = (const block_q4_1 *) vx;
-
-    const dfloat d = x[ib].dm[0];
-    const dfloat m = x[ib].dm[1];
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x() = vui & 0xF;
-    v.y() = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x() = (v.x() * d) + m;
-    v.y() = (v.y() * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q5_0 * x = (const block_q5_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {16.0f, 16.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x() = (v.x() - 16.0f) * d;
-    v.y() = (v.y() - 16.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q5_1 * x = (const block_q5_1 *) vx;
-
-    const dfloat d = x[ib].dm[0];
-    const dfloat m = x[ib].dm[1];
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x() = (v.x() * d) + m;
-    v.y() = (v.y() * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q8_0 * x = (const block_q8_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    v.x() = x[ib].qs[iqs + 0];
-    v.y() = x[ib].qs[iqs + 1];
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-#else
-    v.x() *= d;
-    v.y() *= d;
-#endif // GGML_CUDA_F16
-}
-
-//================================== k-quants
-
-template<typename dst_t>
-static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-
-    const int i = item_ct1.get_group(2);
-    const block_q2_K * x = (const block_q2_K *) vx;
-
-    const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
-    const int n   = tid/32;
-    const int l   = tid - 32*n;
-    const int is  = 8*n + l/16;
-
-    const uint8_t q = x[i].qs[32*n + l];
-    dst_t * y = yy + i*QK_K + 128*n;
-
-    float dall = x[i].dm[0];
-    float dmin = x[i].dm[1];
-    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
-    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
-    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16;  // 0 or 1
-    const int il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-    float dall = __low2half(x[i].dm);
-    float dmin = __high2half(x[i].dm);
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
-}
-
-template<typename dst_t>
-static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-
-    const int i = item_ct1.get_group(2);
-    const block_q3_K * x = (const block_q3_K *) vx;
-
-#if QK_K == 256
-    const int r = item_ct1.get_local_id(2) / 4;
-    const int tid = r/2;
-    const int is0 = r%2;
-    const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
-    const int n = tid / 4;
-    const int j = tid - 4*n;
-
-    uint8_t m = 1 << (4*n + j);
-    int is = 8*n + 2*j + is0;
-    int shift = 2*j;
-
-    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
-                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
-                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
-                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
-    float d_all = x[i].d;
-    float dl = d_all * (us - 32);
-
-    dst_t * y = yy + i*QK_K + 128*n + 32*j;
-    const uint8_t * q = x[i].qs + 32*n;
-    const uint8_t * hm = x[i].hmask;
-
-    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = threadIdx.x;
-    const int is  = tid/16;  // 0 or 1
-    const int il  = tid%16;  // 0...15
-    const int im  = il/8;    // 0...1
-    const int in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
-}
-
-#if QK_K == 256
-static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
-    if (j < 4) {
-        d = q[j] & 63; m = q[j + 4] & 63;
-    } else {
-        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
-        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
-    }
-}
-#endif
-
-template<typename dst_t>
-static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-    const block_q4_K * x = (const block_q4_K *) vx;
-
-    const int i = item_ct1.get_group(2);
-
-#if QK_K == 256
-    // assume 32 threads
-    const int tid = item_ct1.get_local_id(2);
-    const int il  = tid/8;
-    const int ir  = tid%8;
-    const int is  = 2*il;
-    const int n   = 4;
-
-    dst_t * y = yy + i*QK_K + 64*il + n*ir;
-
-    const float dall = x[i].dm[0];
-    const float dmin = x[i].dm[1];
-
-    const uint8_t * q = x[i].qs + 32*il + n*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-    for (int l = 0; l < n; ++l) {
-        y[l + 0] = d1 * (q[l] & 0xF) - m1;
-        y[l +32] = d2 * (q[l] >>  4) - m2;
-    }
-#else
-    const int tid = threadIdx.x;
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-    const block_q5_K * x = (const block_q5_K *) vx;
-
-    const int i = item_ct1.get_group(2);
-
-#if QK_K == 256
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = item_ct1.get_local_id(2);
-    const int il  = tid/16;   // il is in 0...3
-    const int ir  = tid%16;   // ir is in 0...15
-    const int is  = 2*il;     // is is in 0...6
-
-    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
-
-    const float dall = x[i].dm[0];
-    const float dmin = x[i].dm[1];
-
-    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
-    const uint8_t * qh = x[i].qh + 2*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-
-    uint8_t   hm  = 1 << (2*il);
-    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
-    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
-    hm <<= 1;
-    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
-    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int tid = threadIdx.x;
-    const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;  // 0...3
-    const int in = tid%8;  // 0...7
-    const int is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-    const block_q6_K * x = (const block_q6_K *) vx;
-
-    const int i = item_ct1.get_group(2);
-#if QK_K == 256
-
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = item_ct1.get_local_id(2);
-    const int ip  = tid/32;   // ip is 0 or 1
-    const int il  = tid - 32*ip; // 0...32
-    const int is  = 8*ip + il/16;
-
-    dst_t * y = yy + i*QK_K + 128*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t * ql = x[i].ql + 64*ip + il;
-    const uint8_t   qh = x[i].qh[32*ip + il];
-    const int8_t  * sc = x[i].scales + is;
-
-    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
-    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t   ql = x[i].ql[16*ip + il];
-    const uint8_t   qh = x[i].qh[il] >> (2*ip);
-    const int8_t  * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
-}
-
-/*
-DPCT1110:4: The total declared local variable size in device function
-dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q2_K * x = (const block_q2_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
-    const int q_offset = 32*im + l0;
-    const int s_offset = 8*im;
-    const int y_offset = 128*im + l0;
-
-    uint32_t aux[4];
-    const uint8_t * d = (const uint8_t *)aux;
-    const uint8_t * m = (const uint8_t *)(aux + 2);
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-
-        const float dall = x[i].dm[0];
-        const float dmin = x[i].dm[1];
-
-        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
-        aux[0] = a[0] & 0x0f0f0f0f;
-        aux[1] = a[1] & 0x0f0f0f0f;
-        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
-        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
-                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
-                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
-                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
-                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
-                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
-                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
-                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
-            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
-                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-
-        }
-        tmp += dall * sum1 - dmin * sum2;
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const float2 dall = __half22float2(x[i].dm);
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x * sum1 - dall.y * sum2;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-/*
-DPCT1110:5: The total declared local variable size in device function
-dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q3_K * x = (const block_q3_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-
-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
-
-    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
-    const int step = 16/K_QUANTS_PER_ITERATION;
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0....15 or 0...7
-
-    const uint8_t m = 1 << (4*im);
-
-    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
-    const int q_offset =  32*im + l0;
-    const int y_offset = 128*im + l0;
-
-    uint16_t utmp[4];
-    const int8_t * s = (const int8_t *)utmp;
-
-    const uint16_t s_shift = 4*im;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-        const uint8_t * h = x[i].hmask + l0;
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
-        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
-
-        const float d = x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < n; ++l) {
-            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
-                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
-                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
-                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
-            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
-                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
-                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
-                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
-        }
-        tmp += d * sum;
-
-    }
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-/*
-DPCT1110:6: The total declared local variable size in device function
-dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q4_K * x = (const block_q4_K *)vx + ib0;
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
-
-    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
-
-    const int il  = tid/step;                            // 0...3
-    const int ir  = tid - step*il;                       // 0...7 or 0...3
-    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-#if K_QUANTS_PER_ITERATION == 2
-    uint32_t q32[4];
-    const uint8_t * q4 = (const uint8_t *)q32;
-#else
-    uint16_t q16[4];
-    const uint8_t * q4 = (const uint8_t *)q16;
-#endif
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y1 = yy + i*QK_K + y_offset;
-        const float   * y2 = y1 + 128;
-
-        const float dall = x[i].dm[0];
-        const float dmin = x[i].dm[1];
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-#if K_QUANTS_PER_ITERATION == 2
-        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
-        const uint32_t * q2 = q1 + 16;
-
-        q32[0] = q1[0] & 0x0f0f0f0f;
-        q32[1] = q1[0] & 0xf0f0f0f0;
-        q32[2] = q2[0] & 0x0f0f0f0f;
-        q32[3] = q2[0] & 0xf0f0f0f0;
-
-        sycl::float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 4; ++l) {
-            s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4];
-            s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f +
-                       s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) -
-               dmin * smin;
-#else
-        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
-        const uint16_t * q2 = q1 + 32;
-
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[0] & 0xf0f0;
-        q16[2] = q2[0] & 0x0f0f;
-        q16[3] = q2[0] & 0xf0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 2; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
-            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#endif
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-/*
-DPCT1110:7: The total declared local variable size in device function
-dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    const int row = item_ct1.get_group(2);
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q5_K * x = (const block_q5_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = item_ct1.get_local_id(2) / 2; // 0...15
-    const int ix = item_ct1.get_local_id(2) % 2;
-
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 2;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    const uint8_t hm1  = 1 << (2*im);
-    const uint8_t hm2  = hm1 << 4;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-    uint16_t q16[8];
-    const uint8_t * q4 = (const uint8_t *)q16;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2) {
-
-        const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * qh  = x[i].qh + l0;
-        const float   * y1  = yy + i*QK_K + y_offset;
-        const float   * y2  = y1 + 128;
-
-        const float dall = x[i].dm[0];
-        const float dmin = x[i].dm[1];
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        const uint16_t * q1 = (const uint16_t *)ql1;
-        const uint16_t * q2 = q1 + 32;
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[8] & 0x0f0f;
-        q16[2] = (q1[0] >> 4) & 0x0f0f;
-        q16[3] = (q1[8] >> 4) & 0x0f0f;
-        q16[4] = q2[0] & 0x0f0f;
-        q16[5] = q2[8] & 0x0f0f;
-        q16[6] = (q2[0] >> 4) & 0x0f0f;
-        q16[7] = (q2[8] >> 4) & 0x0f0f;
-        for (int l = 0; l < n; ++l) {
-            sum.x() +=
-                y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
-                y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
-            sum.y() +=
-                y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
-                y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
-            sum.z() +=
-                y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
-                y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
-            sum.w() +=
-                y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
-                y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
-            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
-                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-        }
-        tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
-                       sum.w() * sc[5]) -
-               dmin * smin;
-    }
-
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q6_K * x = (const block_q6_K *)vx + ib0;
-
-#if QK_K == 256
-
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-#if K_QUANTS_PER_ITERATION == 1
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
-    const int is = 0;
-#else
-    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
-    const int is = in / 4;
-#endif
-    const int ql_offset = 64*im + l0;
-    const int qh_offset = 32*im + l0;
-    const int s_offset  =  8*im + is;
-    const int y_offset = 128*im + l0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * ql = x[i].ql + ql_offset;
-        const uint8_t * qh = x[i].qh + qh_offset;
-        const int8_t  * s  = x[i].scales + s_offset;
-
-        const float d = x[i].d;
-
-#if K_QUANTS_PER_ITERATION == 1
-        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
-                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
-                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
-                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
-                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
-                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
-                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
-                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
-        tmp += sum;
-#else
-        float sum = 0;
-        for (int l = 0; l < 4; ++l) {
-            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
-                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
-                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
-                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp += sum;
-#endif
-
-    }
-
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const sycl::half *x = (const sycl::half *)vx;
-
-    // automatic half -> float type cast if dfloat == float
-    v.x() = x[ib + iqs + 0];
-    v.y() = x[ib + iqs + 1];
-}
-
-static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const float * x = (const float *) vx;
-
-    // automatic half -> float type cast if dfloat == float
-    v.x() = x[ib + iqs + 0];
-    v.y() = x[ib + iqs + 1];
-}
-
-static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
-                          const sycl::nd_item<3> &item_ct1) {
-    const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                   item_ct1.get_local_id(2);
-
-    if (ix >= kx_padded) {
-        return;
-    }
-
-    const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                   item_ct1.get_local_id(1);
-
-    const int i_padded = iy*kx_padded + ix;
-
-    block_q8_1 * y = (block_q8_1 *) vy;
-
-    const int ib = i_padded / QK8_1; // block index
-    const int iqs = i_padded % QK8_1; // quant index
-
-    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
-    float amax = sycl::fabs((float)xi);
-    float sum = xi;
-
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
-                                    item_ct1.get_sub_group(), amax, mask));
-        sum +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask);
-    }
-
-    const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : sycl::round(xi / d);
-
-    y[ib].qs[iqs] = q;
-
-    if (iqs > 0) {
-        return;
-    }
-
-    reinterpret_cast<sycl::half &>(y[ib].ds.x()) = d;
-    reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
-}
-
-template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void k_get_rows(
-            const void * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12,
-            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
-
-    const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
-                     item_ct1.get_local_id(2)) *
-                    2;
-    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) /
-                    ne12;
-    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) %
-                    ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
-
-    const int ib = i00/qk; // block index
-    const int iqs = (i00%qk)/qr; // quant index
-    const int iybs = i00 - i00%qk; // dst block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(src0_row, ib, iqs, v);
-
-    dst_row[iybs + iqs + 0] = v.x();
-    dst_row[iybs + iqs + y_offset] = v.y();
-}
-
-template<typename src0_t, typename dst_t>
-static void k_get_rows_float(
-            const src0_t * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12,
-            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
-
-    const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
-                    item_ct1.get_local_id(2);
-    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) /
-                    ne12;
-    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) %
-                    ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
-
-    dst_row[i00] = src0_row[i00];
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
-                             const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  2 * item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    const int ib = i/qk; // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(vx, ib, iqs, v);
-
-    y[iybs + iqs + 0] = v.x();
-    y[iybs + iqs + y_offset] = v.y();
-}
-
-// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
-// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
-
-#define VDR_Q4_0_Q8_1_MMVQ 2
-#define VDR_Q4_0_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
-                                                    const float &d4,
-                                                    const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
-        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
-    }
-
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
-
-    // second part effectively subtracts 8 from each quant value
-    return d4 * (sumi * ds8f.x() - (8 * vdr / QI4_0) * ds8f.y());
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q4_1_Q8_1_MMVQ 2
-#define VDR_Q4_1_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
-                                                    const sycl::half2 &dm4,
-                                                    const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = dpct::dp4a(vi0, u[2 * i + 0], sumi);
-        sumi = dpct::dp4a(vi1, u[2 * i + 1], sumi);
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
-    const float d4d8 = tmp.x;
-    const float m4s8 = tmp.y;
-#else
-    const sycl::float2 dm4f =
-        dm4.convert<float, sycl::rounding_mode::automatic>();
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
-    const float d4d8 = dm4f.x() * ds8f.x();
-    const float m4s8 = dm4f.y() * ds8f.y();
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
-    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_0_Q8_1_MMVQ 2
-#define VDR_Q5_0_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float
-vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
-                       const float &d5, const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = dpct::dp4a(vi0, u[2 * i + 0],
-                          sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = dpct::dp4a(vi1, u[2 * i + 1],
-                          sumi); // SIMD dot product of quantized values
-    }
-
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
-
-    // second part effectively subtracts 16 from each quant value
-    return d5 * (sumi * ds8f.x() - (16 * vdr / QI5_0) * ds8f.y());
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_1_Q8_1_MMVQ 2
-#define VDR_Q5_1_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float
-vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
-                       const sycl::half2 &dm5, const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = dpct::dp4a(vi0, u[2 * i + 0],
-                          sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = dpct::dp4a(vi1, u[2 * i + 1],
-                          sumi); // SIMD dot product of quantized values
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
-    const float d5d8 = tmp.x;
-    const float m5s8 = tmp.y;
-#else
-    const sycl::float2 dm5f =
-        dm5.convert<float, sycl::rounding_mode::automatic>();
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
-    const float d5d8 = dm5f.x() * ds8f.x();
-    const float m5s8 = dm5f.y() * ds8f.y();
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
-    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q8_0_Q8_1_MMVQ 2
-#define VDR_Q8_0_Q8_1_MMQ 8
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
-                                                    const float &d8_0,
-                                                    const float &d8_1) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = dpct::dp4a(v[i], u[i], sumi);
-    }
-
-    return d8_0*d8_1 * sumi;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
-                                                    const sycl::half2 &dm8,
-                                                    const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = dpct::dp4a(v[i], u[i], sumi);
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
-    const float d8d8 = tmp.x;
-    const float m8s8 = tmp.y;
-#else
-    const sycl::float2 dm8f =
-        dm8.convert<float, sycl::rounding_mode::automatic>();
-    const sycl::float2 ds8f =
-        ds8.convert<float, sycl::rounding_mode::automatic>();
-    const float d8d8 = dm8f.x() * ds8f.x();
-    const float m8s8 = dm8f.y() * ds8f.y();
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
-    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q2_K_Q8_1_MMVQ 1
-#define VDR_Q2_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
-    const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales,
-    const sycl::half2 &dm2, const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++i) {
-        const int sc = scales[2*i];
-
-        const int vi = (v >> (2*i)) & 0x03030303;
-
-        sumf_d +=
-            d8[i] * (dpct::dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-        sumf_m += d8[i] *
-                  dpct::dp4a(
-                      m, u[i],
-                      0); // multiply constant q2_K part with sum of q8_1 values
-    }
-
-    const sycl::float2 dm2f =
-        dm2.convert<float, sycl::rounding_mode::automatic>();
-
-    return dm2f.x() * sumf_d - dm2f.y() * sumf_m;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __dpct_inline__ float
-vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
-                           const uint8_t *__restrict__ scales,
-                           const sycl::half2 &dm2, const float &d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi_d = 0;
-    int sumi_m = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
-        int sumi_d_sc = 0;
-
-        const int sc = scales[i0 / (QI8_1/2)];
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-
-#pragma unroll
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_d_sc = dpct::dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
-            sumi_m = dpct::dp4a(m, u[i],
-                                sumi_m); // multiply sum of q8_1 values with m
-        }
-
-        sumi_d += sumi_d_sc * (sc & 0xF);
-    }
-
-    const sycl::float2 dm2f =
-        dm2.convert<float, sycl::rounding_mode::automatic>();
-
-    return d8 * (dm2f.x() * sumi_d - dm2f.y() * sumi_m);
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q3_K_Q8_1_MMVQ 1
-#define VDR_Q3_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
-    const int &vl, const int &vh, const int *__restrict__ u,
-    const uint8_t *__restrict__ scales, const int &scale_offset,
-    const float &d3, const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        const int isc = scale_offset + 2*i;
-
-        const int isc_low = isc % (QK_K/32);
-        const int sc_shift_low = 4 * (isc / (QK_K/32));
-        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
-
-        const int isc_high = isc % (QK_K/64);
-        const int sc_shift_high = 2 * (isc / (QK_K/64));
-        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-
-        const int sc = (sc_low | sc_high) - 32;
-
-        const int vil = (vl >> (2*i)) & 0x03030303;
-
-        const int vih = ((vh >> i) << 2) & 0x04040404;
-
-        const int vi =
-            dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat());
-
-        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d3 * sumf;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __dpct_inline__ float
-vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
-                           const int8_t *__restrict__ scales, const float &d3,
-                           const float &d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
-        int sumi_sc = 0;
-
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_sc = dpct::dp4a(v[i], u[i], sumi_sc); // SIMD dot product
-        }
-
-        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
-    }
-
-    return d3*d8 * sumi;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q4_K_Q8_1_MMVQ 2
-#define VDR_Q4_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
-    const int *__restrict__ v, const int *__restrict__ u,
-    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K; ++i) {
-        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
-        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int dot1 =
-            dpct::dp4a(v1i, u[2 * i + 1],
-                       dpct::dp4a(v0i, u[2 * i + 0], 0)); // SIMD dot product
-        const int dot2 =
-            dpct::dp4a(0x01010101, u[2 * i + 1],
-                       dpct::dp4a(0x01010101, u[2 * i + 0], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
-    }
-
-    const sycl::float2 dm4f =
-        dm4.convert<float, sycl::rounding_mode::automatic>();
-
-    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
-    const int *__restrict__ v, const int *__restrict__ u,
-    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = dpct::dp4a((v[j] >> (4 * i)) & 0x0F0F0F0F,
-                                u[i * QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const sycl::float2 ds8f =
-            ds8[i].convert<float, sycl::rounding_mode::automatic>();
-
-        sumf_d += ds8f.x() * (sc[i] * sumi_d);
-        sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const sycl::float2 dm4f =
-        dm4.convert<float, sycl::rounding_mode::automatic>();
-
-    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_K_Q8_1_MMVQ 2
-#define VDR_Q5_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
-    const int *__restrict__ vl, const int *__restrict__ vh,
-    const int *__restrict__ u, const uint8_t *__restrict__ sc,
-    const uint8_t *__restrict__ m, const sycl::half2 &dm5,
-    const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
-        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
-        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
-
-        const int v0i = vl0i | vh0i;
-        const int v1i = vl1i | vh1i;
-
-        const int dot1 =
-            dpct::dp4a(v0i, u[2 * i + 0],
-                       dpct::dp4a(v1i, u[2 * i + 1], 0)); // SIMD dot product
-        const int dot2 =
-            dpct::dp4a(0x01010101, u[2 * i + 0],
-                       dpct::dp4a(0x01010101, u[2 * i + 1], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);
-
-    }
-
-    const sycl::float2 dm5f =
-        dm5.convert<float, sycl::rounding_mode::automatic>();
-
-    return dm5f.x() * sumf_d - dm5f.y() * sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
-    const int *__restrict__ v, const int *__restrict__ u,
-    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = dpct::dp4a(v[i * QI8_1 + j], u[i * QI8_1 + j],
-                                sumi_d); // SIMD dot product
-        }
-
-        const sycl::float2 ds8f =
-            ds8[i].convert<float, sycl::rounding_mode::automatic>();
-
-        sumf_d += ds8f.x() * (sc[i] * sumi_d);
-        sumf_m += ds8f.y() * m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const sycl::float2 dm4f =
-        dm4.convert<float, sycl::rounding_mode::automatic>();
-
-    return dm4f.x() * sumf_d - dm4f.y() * sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q6_K_Q8_1_MMVQ 1
-#define VDR_Q6_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __dpct_inline__ float
-vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
-                            const int *__restrict__ u,
-                            const int8_t *__restrict__ scales, const float &d,
-                            const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        const int sc = scales[4*i];
-
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-
-        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
-
-        const int vi = dpct::vectorized_binary<sycl::char4>(
-            (vil | vih), 0x20202020, dpct::sub_sat()); // vi = (vil | vih) - 32
-
-        sumf += d8[i] * (dpct::dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d*sumf;
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __dpct_inline__ float
-vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
-                           const int8_t *__restrict__ sc, const float &d6,
-                           const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-
-#pragma unroll
-    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
-        sycl::int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
-
-#pragma unroll
-        for (int i = i0; i < i0 + 2; ++i) {
-            sumi_d.x() = dpct::dp4a(v[2 * i + 0], u[2 * i + 0],
-                                    sumi_d.x()); // SIMD dot product
-            sumi_d.x() = dpct::dp4a(v[2 * i + 1], u[2 * i + 1],
-                                    sumi_d.x()); // SIMD dot product
-
-            sumi_d.y() = dpct::dp4a(v[2 * i + 4], u[2 * i + 4],
-                                    sumi_d.y()); // SIMD dot product
-            sumi_d.y() = dpct::dp4a(v[2 * i + 5], u[2 * i + 5],
-                                    sumi_d.y()); // SIMD dot product
-        }
-
-        sumf_d += d8[i0 / 4] *
-                  (sc[i0 / 2 + 0] * sumi_d.x() + sc[i0 / 2 + 1] * sumi_d.y());
-    }
-
-    return d6 * sumf_d;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-static __dpct_inline__ float
-vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
-
-    int v[VDR_Q4_0_Q8_1_MMVQ];
-    int u[2*VDR_Q4_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
-        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs_q4_0, float *tile_x_d_q4_0) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_qs_q4_0;
-    *x_dm = (sycl::half2 *)tile_x_d_q4_0;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_0;
-    const int kqsx = k % QI4_0;
-
-    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
-
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
-        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (const float *) x_dm;
-
-    int u[2*VDR_Q4_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
-         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
-
-    int v[VDR_Q4_1_Q8_1_MMVQ];
-    int u[2*VDR_Q4_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
-        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs_q4_1, sycl::half2 *tile_x_dm_q4_1) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_qs_q4_1;
-    *x_dm = tile_x_dm_q4_1;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_1;
-    const int kqsx = k % QI4_1;
-
-    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
-        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-
-    int u[2*VDR_Q4_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
-         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
-
-    int vl[VDR_Q5_0_Q8_1_MMVQ];
-    int vh[VDR_Q5_0_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
-        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
-        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
-    }
-
-    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q5_0, float *tile_x_d_q5_0) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_ql_q5_0;
-    *x_dm = (sycl::half2 *)tile_x_d_q5_0;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_0;
-    const int kqsx = k % QI5_0;
-
-    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        const int ql = get_int_from_uint8(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
-
-        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
-        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
-        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
-        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
-        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
-        qs0 = dpct::vectorized_binary<sycl::char4>(
-            qs0, 0x10101010, dpct::sub_sat()); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
-        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
-        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
-        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
-        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
-        qs1 = dpct::vectorized_binary<sycl::char4>(
-            qs1, 0x10101010, dpct::sub_sat()); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
-        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    int u[2*VDR_Q5_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
-    }
-
-    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
-
-    int vl[VDR_Q5_1_Q8_1_MMVQ];
-    int vh[VDR_Q5_1_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
-        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
-        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
-    }
-
-    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q5_1, sycl::half2 *tile_x_dm_q5_1) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_ql_q5_1;
-    *x_dm = tile_x_dm_q5_1;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset < nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_1;
-    const int kqsx = k % QI5_1;
-
-    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
-
-        int qs0 = (ql >>  0) & 0x0F0F0F0F;
-        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
-        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
-        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
-        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4) & 0x0F0F0F0F;
-        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
-        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
-        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
-        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
-        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
-
-    int u[2*VDR_Q5_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
-    }
-
-    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-
-    int v[VDR_Q8_0_Q8_1_MMVQ];
-    int u[VDR_Q8_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
-        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
-        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-    }
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d,
-                                                      bq8_1->ds[0]);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs_q8_0, float *tile_x_d_q8_0) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_qs_q8_0;
-    *x_dm = (sycl::half2 *)tile_x_d_q8_0;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI8_0;
-    const int kqsx = k % QI8_0;
-    float * x_dmf = (float *) x_dm;
-
-    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
-        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
-         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
-
-    const int bq8_offset = QR2_K * (iqs / QI8_1);
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const uint8_t * scales = bq2_K->scales + scale_offset;
-
-    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
-    int    u[QR2_K];
-    float d8[QR2_K];
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++ i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds[0];
-    }
-
-    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q2_K, sycl::half2 *tile_x_dm_q2_K,
-                    int *tile_x_sc_q2_K) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql_q2_K;
-    *x_dm = tile_x_dm_q2_K;
-    *x_sc = tile_x_sc_q2_K;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI2_K;
-    const int kqsx = k % QI2_K;
-
-    const block_q2_K * bx0 = (const block_q2_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
-        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
-
-        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
-    }
-}
-
-static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const int kbx = k / QI2_K;
-    const int ky  = (k % QI2_K) * QR2_K;
-    const float * y_df = (const float *) y_ds;
-
-    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
-
-    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
-    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
-
-#pragma unroll
-    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
-        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
-    }
-
-    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
-
-    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
-    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
-
-    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const float d = bq3_K->d;
-
-    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
-
-    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
-
-    int    u[QR3_K];
-    float d8[QR3_K];
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds[0];
-    }
-
-    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q3_K, sycl::half2 *tile_x_dm_q3_K,
-                    int *tile_x_qh_q3_K, int *tile_x_sc_q3_K) {
-
-    *x_ql = tile_x_ql_q3_K;
-    *x_dm = tile_x_dm_q3_K;
-    *x_qh = tile_x_qh_q3_K;
-    *x_sc = tile_x_sc_q3_K;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI3_K;
-    const int kqsx = k % QI3_K;
-
-    const block_q3_K * bx0 = (const block_q3_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
-        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
-        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
-
-        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
-
-        const int ksc = k % (QI3_K/4);
-
-        const int ksc_low = ksc % (QI3_K/8);
-        const int shift_low = 4 * (ksc / (QI3_K/8));
-        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
-
-        const int ksc_high = QI3_K/8;
-        const int shift_high = 2 * ksc;
-        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
-
-        const int sc = dpct::vectorized_binary<sycl::char4>(
-            sc_low | sc_high, 0x20202020, dpct::sub_sat());
-
-        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-
-    const int kbx  = k / QI3_K;
-    const int ky  = (k % QI3_K) * QR3_K;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
-
-    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
-        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
-        const int shift = 2 * ((ky % 32) / 8);
-        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
-
-        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
-        const int vlh = (vh << 2) & 0x04040404;
-
-        v[l] = dpct::vectorized_binary<sycl::char4>(vll, vlh, dpct::sub_sat());
-    }
-
-    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
-    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    int    v[2];
-    int    u[2*QR4_K];
-    float d8[QR4_K];
-
-    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
-
-    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
-    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
-    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
-    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
-
-    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    v[0] = q4[0];
-    v[1] = q4[4];
-
-    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-    for (int i = 0; i < QR4_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds[0];
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = __low2float(bq8_1[0].ds);
-    const float d8_2 = __low2float(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
-    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q4_K, sycl::half2 *tile_x_dm_q4_K,
-                    int *tile_x_sc_q4_K) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql_q4_K;
-    *x_dm = tile_x_dm_q4_K;
-    *x_sc = tile_x_sc_q4_K;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI4_K; // == k if QK_K == 256
-
-    const block_q4_K * bx0 = (const block_q4_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
-        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-#if QK_K == 256
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE/8);
-
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
-
-    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    int   vl[2];
-    int   vh[2];
-    int    u[2*QR5_K];
-    float d8[QR5_K];
-
-    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
-    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
-
-    vl[0] = ql[0];
-    vl[1] = ql[4];
-
-    vh[0] = qh[0] >> bq8_offset;
-    vh[1] = qh[4] >> bq8_offset;
-
-    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds[0];
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = __low2half(bq8_1[0].ds);
-    const float d8_2 = __low2half(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    bad_arch();
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql_q5_K, sycl::half2 *tile_x_dm_q5_K,
-                    int *tile_x_sc_q5_K) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql_q5_K;
-    *x_dm = tile_x_dm_q5_K;
-    *x_sc = tile_x_sc_q5_K;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI5_K; // == k if QK_K == 256
-
-    const block_q5_K * bx0 = (const block_q5_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR5_K*kqsx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
-        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
-        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
-
-        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
-        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
-
-        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
-        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
-        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-#if QK_K == 256
-        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE/8);
-
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
-
-    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
-    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
-    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
-
-    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
-    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
-    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
-
-    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
-    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
-
-    const int8_t * scales = bq6_K->scales + scale_offset;
-
-    int    u[QR6_K];
-    float d8[QR6_K];
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2 * i].ds[0];
-    }
-
-    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI6_K; // == k if QK_K == 256
-
-    const block_q6_K * bx0 = (const block_q6_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR6_K*kqsx;
-
-        const int ql = get_int_from_uint8(bxi->ql, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
-        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
-        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
-
-        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
-        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
-
-        x_ql[i * (2 * WARP_SIZE + 1) + kq0] =
-            dpct::vectorized_binary<sycl::char4>(ql0 | qh0, 0x20202020,
-                                                 dpct::sub_sat());
-        x_ql[i * (2 * WARP_SIZE + 1) + kq1] =
-            dpct::vectorized_binary<sycl::char4>(ql1 | qh1, 0x20202020,
-                                                 dpct::sub_sat());
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
-        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
-    }
-}
-
-static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
-
-    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
-    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
-    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
-}
-
-template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
-          int mmq_y, int nwarps, load_tiles_cuda_t load_tiles, int vdr,
-          vec_dot_q_mul_mat_cuda_t vec_dot>
-/*
-DPCT1110:8: The total declared local variable size in device function mul_mat_q
-exceeds 128 bytes and may cause high register pressure. Consult with your
-hardware vendor to find the total register size available and adjust the code,
-or use smaller sub-group size to avoid high register pressure.
-*/
-static __dpct_inline__ void
-mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
-          float *__restrict__ dst, const int ncols_x, const int nrows_x,
-          const int ncols_y, const int nrows_y, const int nrows_dst,
-          int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh,
-          int *tile_x_sc, const sycl::nd_item<3> &item_ct1, int *tile_y_qs,
-          sycl::half2 *tile_y_ds) {
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    const int blocks_per_row_x = ncols_x / qk;
-    const int blocks_per_col_y = nrows_y / QK8_1;
-    const int blocks_per_warp = WARP_SIZE / qi;
-
-    const int & ncols_dst = ncols_y;
-
-    const int row_dst_0 = item_ct1.get_group(2) * mmq_y;
-    const int & row_x_0 = row_dst_0;
-
-    const int col_dst_0 = item_ct1.get_group(1) * mmq_x;
-    const int & col_y_0 = col_dst_0;
-
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
-
-    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
-
-        load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm,
-                   tile_x_qh, tile_x_sc, item_ct1.get_local_id(1),
-                   nrows_x - row_x_0 - 1, item_ct1.get_local_id(2),
-                   blocks_per_row_x);
-
-#pragma unroll
-        for (int ir = 0; ir < qr; ++ir) {
-            const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2);
-            const int kbxd = kqs / QI8_1;
-
-#pragma unroll
-            for (int i = 0; i < mmq_x; i += nwarps) {
-                const int col_y_eff = dpct::min(
-                    (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i),
-                    ncols_y - 1); // to prevent out-of-bounds memory accesses
-
-                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
-
-                const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE +
-                                    kqs % WARP_SIZE;
-                tile_y_qs[index_y] = get_int_from_int8_aligned(
-                    by0->qs, item_ct1.get_local_id(2) % QI8_1);
-            }
-
-#pragma unroll
-            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
-                const int ids =
-                    (ids0 + item_ct1.get_local_id(1) * QI8_1 +
-                     item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) %
-                    mmq_x;
-                const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1);
-                const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1);
-
-                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
-                const sycl::half2 *dsi_src =
-                    &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) +
-                       ir * (WARP_SIZE / QI8_1) + kby]
-                         .ds;
-                sycl::half2 *dsi_dst =
-                    &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby];
-                if (need_sum) {
-                    *dsi_dst = *dsi_src;
-                } else {
-                    float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = (*dsi_src)[0];
-                }
-            }
-
-            /*
-            DPCT1118:9: SYCL group functions and algorithms must be encountered
-            in converged control flow. You may need to adjust the code.
-            */
-            /*
-            DPCT1065:56: Consider replacing sycl::nd_item::barrier() with
-            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-            better performance if there is no access to global memory.
-            */
-            item_ct1.barrier();
-
-// #pragma unroll // unrolling this loop causes too much register pressure
-            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
-#pragma unroll
-                for (int j = 0; j < mmq_x; j += nwarps) {
-#pragma unroll
-                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-                        sum[i / WARP_SIZE][j / nwarps] += vec_dot(
-                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                            tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i,
-                            item_ct1.get_local_id(1) + j, k);
-                    }
-                }
-            }
-
-            /*
-            DPCT1118:10: SYCL group functions and algorithms must be encountered
-            in converged control flow. You may need to adjust the code.
-            */
-            /*
-            DPCT1065:57: Consider replacing sycl::nd_item::barrier() with
-            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-            better performance if there is no access to global memory.
-            */
-            item_ct1.barrier();
-        }
-    }
-
-#pragma unroll
-    for (int j = 0; j < mmq_x; j += nwarps) {
-        const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1);
-
-        if (col_dst >= ncols_dst) {
-            return;
-        }
-
-#pragma unroll
-        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-            const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i;
-
-            if (row_dst >= nrows_dst) {
-                continue;
-            }
-
-            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
-        }
-    }
-}
-
-#define  MMQ_X_Q4_0_RDNA2  64
-#define  MMQ_Y_Q4_0_RDNA2  128
-#define NWARPS_Q4_0_RDNA2  8
-#define  MMQ_X_Q4_0_RDNA1  64
-#define  MMQ_Y_Q4_0_RDNA1  64
-#define NWARPS_Q4_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_0_AMPERE 4
-#define  MMQ_Y_Q4_0_AMPERE 32
-#define NWARPS_Q4_0_AMPERE 4
-#else
-#define  MMQ_X_Q4_0_AMPERE 64
-#define  MMQ_Y_Q4_0_AMPERE 128
-#define NWARPS_Q4_0_AMPERE 4
-#endif
-#define  MMQ_X_Q4_0_PASCAL 64
-#define  MMQ_Y_Q4_0_PASCAL 64
-#define NWARPS_Q4_0_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q4_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_0, float *tile_x_d_q4_0,
-    int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
-    const int nwarps = NWARPS_Q4_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
-    const int nwarps = NWARPS_Q4_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
-    const int nwarps = NWARPS_Q4_0_AMPERE;
-    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_qs_q4_0, tile_x_d_q4_0);
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
-              load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ,
-              vec_dot_q4_0_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
-    const int nwarps = NWARPS_Q4_0_PASCAL;
-
-    allocate_tiles_q4_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q4_0_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q4_1_RDNA2  64
-#define  MMQ_Y_Q4_1_RDNA2  128
-#define NWARPS_Q4_1_RDNA2  8
-#define  MMQ_X_Q4_1_RDNA1  64
-#define  MMQ_Y_Q4_1_RDNA1  64
-#define NWARPS_Q4_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_1_AMPERE 4
-#define  MMQ_Y_Q4_1_AMPERE 32
-#define NWARPS_Q4_1_AMPERE 4
-#else
-#define  MMQ_X_Q4_1_AMPERE 64
-#define  MMQ_Y_Q4_1_AMPERE 128
-#define NWARPS_Q4_1_AMPERE 4
-#endif
-#define  MMQ_X_Q4_1_PASCAL 64
-#define  MMQ_Y_Q4_1_PASCAL 64
-#define NWARPS_Q4_1_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q4_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q4_1,
-    sycl::half2 *tile_x_dm_q4_1, int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
-    const int nwarps = NWARPS_Q4_1_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
-    const int nwarps = NWARPS_Q4_1_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
-    const int nwarps = NWARPS_Q4_1_AMPERE;
-    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_qs_q4_1, tile_x_dm_q4_1);
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
-              load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ,
-              vec_dot_q4_1_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
-    const int nwarps = NWARPS_Q4_1_PASCAL;
-    allocate_tiles_q4_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q4_1_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_0_RDNA2  64
-#define  MMQ_Y_Q5_0_RDNA2  128
-#define NWARPS_Q5_0_RDNA2  8
-#define  MMQ_X_Q5_0_RDNA1  64
-#define  MMQ_Y_Q5_0_RDNA1  64
-#define NWARPS_Q5_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_0_AMPERE 4
-#define  MMQ_Y_Q5_0_AMPERE 32
-#define NWARPS_Q5_0_AMPERE 4
-#else
-#define  MMQ_X_Q5_0_AMPERE 128
-#define  MMQ_Y_Q5_0_AMPERE 64
-#define NWARPS_Q5_0_AMPERE 4
-#endif
-#define  MMQ_X_Q5_0_PASCAL 64
-#define  MMQ_Y_Q5_0_PASCAL 64
-#define NWARPS_Q5_0_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q5_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_0, float *tile_x_d_q5_0,
-    int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
-    const int nwarps = NWARPS_Q5_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
-    const int nwarps = NWARPS_Q5_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
-    const int nwarps = NWARPS_Q5_0_AMPERE;
-    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q5_0, tile_x_d_q5_0);
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
-              load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ,
-              vec_dot_q5_0_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
-    const int nwarps = NWARPS_Q5_0_PASCAL;
-    allocate_tiles_q5_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q5_0_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_1_RDNA2  64
-#define  MMQ_Y_Q5_1_RDNA2  128
-#define NWARPS_Q5_1_RDNA2  8
-#define  MMQ_X_Q5_1_RDNA1  64
-#define  MMQ_Y_Q5_1_RDNA1  64
-#define NWARPS_Q5_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_1_AMPERE 4
-#define  MMQ_Y_Q5_1_AMPERE 32
-#define NWARPS_Q5_1_AMPERE 4
-#else
-#define  MMQ_X_Q5_1_AMPERE 128
-#define  MMQ_Y_Q5_1_AMPERE 64
-#define NWARPS_Q5_1_AMPERE 4
-#endif
-#define  MMQ_X_Q5_1_PASCAL 64
-#define  MMQ_Y_Q5_1_PASCAL 64
-#define NWARPS_Q5_1_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q5_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_1,
-    sycl::half2 *tile_x_dm_q5_1, int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
-    const int nwarps = NWARPS_Q5_1_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
-    const int nwarps = NWARPS_Q5_1_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
-    const int nwarps = NWARPS_Q5_1_AMPERE;
-    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q5_1, tile_x_dm_q5_1);
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
-              load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ,
-              vec_dot_q5_1_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
-    const int nwarps = NWARPS_Q5_1_PASCAL;
-    allocate_tiles_q5_1<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q5_1_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q8_0_RDNA2  64
-#define  MMQ_Y_Q8_0_RDNA2  128
-#define NWARPS_Q8_0_RDNA2  8
-#define  MMQ_X_Q8_0_RDNA1  64
-#define  MMQ_Y_Q8_0_RDNA1  64
-#define NWARPS_Q8_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q8_0_AMPERE 4
-#define  MMQ_Y_Q8_0_AMPERE 32
-#define NWARPS_Q8_0_AMPERE 4
-#else
-#define  MMQ_X_Q8_0_AMPERE 128
-#define  MMQ_Y_Q8_0_AMPERE 64
-#define NWARPS_Q8_0_AMPERE 4
-#endif
-#define  MMQ_X_Q8_0_PASCAL 64
-#define  MMQ_Y_Q8_0_PASCAL 64
-#define NWARPS_Q8_0_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q8_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_qs_q8_0, float *tile_x_d_q8_0,
-    int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
-    const int nwarps = NWARPS_Q8_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
-    const int nwarps = NWARPS_Q8_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
-    const int nwarps = NWARPS_Q8_0_AMPERE;
-    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_qs_q8_0, tile_x_d_q8_0);
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
-              load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ,
-              vec_dot_q8_0_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
-    const int nwarps = NWARPS_Q8_0_PASCAL;
-    allocate_tiles_q8_0<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q8_0_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q2_K_RDNA2  64
-#define  MMQ_Y_Q2_K_RDNA2  128
-#define NWARPS_Q2_K_RDNA2  8
-#define  MMQ_X_Q2_K_RDNA1  128
-#define  MMQ_Y_Q2_K_RDNA1  32
-#define NWARPS_Q2_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q2_K_AMPERE 4
-#define  MMQ_Y_Q2_K_AMPERE 32
-#define NWARPS_Q2_K_AMPERE 4
-#else
-#define  MMQ_X_Q2_K_AMPERE 64
-#define  MMQ_Y_Q2_K_AMPERE 128
-#define NWARPS_Q2_K_AMPERE 4
-#endif
-#define  MMQ_X_Q2_K_PASCAL 64
-#define  MMQ_Y_Q2_K_PASCAL 64
-#define NWARPS_Q2_K_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q2_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q2_K,
-    sycl::half2 *tile_x_dm_q2_K, int *tile_x_sc_q2_K, int *tile_y_qs,
-    sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
-    const int nwarps = NWARPS_Q2_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
-    const int nwarps = NWARPS_Q2_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
-    const int nwarps = NWARPS_Q2_K_AMPERE;
-    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q2_K, tile_x_dm_q2_K, tile_x_sc_q2_K);
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
-              load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ,
-              vec_dot_q2_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
-    const int nwarps = NWARPS_Q2_K_PASCAL;
-    allocate_tiles_q2_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q2_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q3_K_RDNA2  128
-#define  MMQ_Y_Q3_K_RDNA2  64
-#define NWARPS_Q3_K_RDNA2  8
-#define  MMQ_X_Q3_K_RDNA1  32
-#define  MMQ_Y_Q3_K_RDNA1  128
-#define NWARPS_Q3_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q3_K_AMPERE 4
-#define  MMQ_Y_Q3_K_AMPERE 32
-#define NWARPS_Q3_K_AMPERE 4
-#else
-#define  MMQ_X_Q3_K_AMPERE 128
-#define  MMQ_Y_Q3_K_AMPERE 128
-#define NWARPS_Q3_K_AMPERE 4
-#endif
-#define  MMQ_X_Q3_K_PASCAL 64
-#define  MMQ_Y_Q3_K_PASCAL 64
-#define NWARPS_Q3_K_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q3_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q3_K,
-    sycl::half2 *tile_x_dm_q3_K, int *tile_x_qh_q3_K, int *tile_x_sc_q3_K,
-    int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
-    const int nwarps = NWARPS_Q3_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
-    const int nwarps = NWARPS_Q3_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
-    const int nwarps = NWARPS_Q3_K_AMPERE;
-    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q3_K, tile_x_dm_q3_K, tile_x_qh_q3_K,
-                               tile_x_sc_q3_K);
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
-              load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ,
-              vec_dot_q3_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
-    const int nwarps = NWARPS_Q3_K_PASCAL;
-    allocate_tiles_q3_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q3_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q4_K_RDNA2  64
-#define  MMQ_Y_Q4_K_RDNA2  128
-#define NWARPS_Q4_K_RDNA2  8
-#define  MMQ_X_Q4_K_RDNA1  32
-#define  MMQ_Y_Q4_K_RDNA1  64
-#define NWARPS_Q4_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_K_AMPERE 4
-#define  MMQ_Y_Q4_K_AMPERE 32
-#define NWARPS_Q4_K_AMPERE 4
-#else
-#define  MMQ_X_Q4_K_AMPERE 64
-#define  MMQ_Y_Q4_K_AMPERE 128
-#define NWARPS_Q4_K_AMPERE 4
-#endif
-#define  MMQ_X_Q4_K_PASCAL 64
-#define  MMQ_Y_Q4_K_PASCAL 64
-#define NWARPS_Q4_K_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q4_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q4_K,
-    sycl::half2 *tile_x_dm_q4_K, int *tile_x_sc_q4_K, int *tile_y_qs,
-    sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
-    const int nwarps = NWARPS_Q4_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
-    const int nwarps = NWARPS_Q4_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
-    const int nwarps = NWARPS_Q4_K_AMPERE;
-    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q4_K, tile_x_dm_q4_K, tile_x_sc_q4_K);
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
-              load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ,
-              vec_dot_q4_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
-    const int nwarps = NWARPS_Q4_K_PASCAL;
-    allocate_tiles_q4_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q4_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_K_RDNA2  64
-#define  MMQ_Y_Q5_K_RDNA2  128
-#define NWARPS_Q5_K_RDNA2  8
-#define  MMQ_X_Q5_K_RDNA1  32
-#define  MMQ_Y_Q5_K_RDNA1  64
-#define NWARPS_Q5_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_K_AMPERE 4
-#define  MMQ_Y_Q5_K_AMPERE 32
-#define NWARPS_Q5_K_AMPERE 4
-#else
-#define  MMQ_X_Q5_K_AMPERE 64
-#define  MMQ_Y_Q5_K_AMPERE 128
-#define NWARPS_Q5_K_AMPERE 4
-#endif
-#define  MMQ_X_Q5_K_PASCAL 64
-#define  MMQ_Y_Q5_K_PASCAL 64
-#define NWARPS_Q5_K_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q5_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql_q5_K,
-    sycl::half2 *tile_x_dm_q5_K, int *tile_x_sc_q5_K, int *tile_y_qs,
-    sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
-    const int nwarps = NWARPS_Q5_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
-    const int nwarps = NWARPS_Q5_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
-    const int nwarps = NWARPS_Q5_K_AMPERE;
-    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql_q5_K, tile_x_dm_q5_K, tile_x_sc_q5_K);
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
-              load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ,
-              vec_dot_q5_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
-    const int nwarps = NWARPS_Q5_K_PASCAL;
-    allocate_tiles_q5_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q5_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q6_K_RDNA2  64
-#define  MMQ_Y_Q6_K_RDNA2  128
-#define NWARPS_Q6_K_RDNA2  8
-#define  MMQ_X_Q6_K_RDNA1  32
-#define  MMQ_Y_Q6_K_RDNA1  64
-#define NWARPS_Q6_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q6_K_AMPERE 4
-#define  MMQ_Y_Q6_K_AMPERE 32
-#define NWARPS_Q6_K_AMPERE 4
-#else
-#define  MMQ_X_Q6_K_AMPERE 64
-#define  MMQ_Y_Q6_K_AMPERE 64
-#define NWARPS_Q6_K_AMPERE 4
-#endif
-#define  MMQ_X_Q6_K_PASCAL 64
-#define  MMQ_Y_Q6_K_PASCAL 64
-#define NWARPS_Q6_K_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_PASCAL, 2)
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q6_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst,
-    const sycl::nd_item<3> &item_ct1, int *tile_x_ql, sycl::half2 *tile_x_dm,
-    int *tile_x_sc, int *tile_y_qs, sycl::half2 *tile_y_ds) {
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
-    const int nwarps = NWARPS_Q6_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
-    const int nwarps = NWARPS_Q6_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
-    const int nwarps = NWARPS_Q6_K_AMPERE;
-    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc,
-                               tile_x_ql, tile_x_dm, tile_x_sc);
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
-              load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ,
-              vec_dot_q6_K_q8_1_mul_mat>(
-        vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql,
-        tile_x_dm, tile_x_qh, tile_x_sc, item_ct1, tile_y_qs, tile_y_ds);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
-    const int nwarps = NWARPS_Q6_K_PASCAL;
-    allocate_tiles_q6_K<mmq_y>(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst, tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc);
-#else
-    (void) vec_dot_q6_K_q8_1_mul_mat;
-    bad_arch();
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
-static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
-                          const sycl::nd_item<3> &item_ct1) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
-// partial sum for each thread
-    float tmp = 0.0f;
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row * blocks_per_row + i +
-                        item_ct1.get_local_id(2) / (qi / vdr); // x block index
-
-        const int iby = (i + item_ct1.get_local_id(2) / (qi / vdr)) *
-                        (qk / QK8_1); // y block index that aligns with ibx
-
-        const int iqs =
-            vdr *
-            (item_ct1.get_local_id(2) %
-             (qi / vdr)); // x block quant index when casting the quants to int
-
-        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
-                                   const sycl::nd_item<3> &item_ct1) {
-    // qk = quantized weights per x block
-    // qr = number of quantized weights per data value in x block
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int tid = item_ct1.get_local_id(2);
-
-    const int iter_stride = 2*GGML_CUDA_DMMV_X;
-    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-// partial sum for each thread
-#ifdef GGML_CUDA_F16
-    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
-#else
-    float tmp = 0.0f;
-#endif // GGML_CUDA_F16
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int ib = (row*ncols + col)/qk; // x block index
-        const int iqs = (col%qk)/qr; // x quant index
-        const int iybs = col - col%qk; // y block start index
-
-// processing >2 values per i iter is faster for fast GPUs
-#pragma unroll
-        for (int j = 0; j < vals_per_iter; j += 2) {
-            // process 2 vals per j iter
-
-            // dequantize
-            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
-            dfloat2 v;
-            dequantize_kernel(vx, ib, iqs + j/qr, v);
-
-            // matrix multiplication
-            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
-#ifdef GGML_CUDA_F16
-            tmp += __hmul2(v, {
-                y[iybs + iqs + j/qr + 0],
-                y[iybs + iqs + j/qr + y_offset]
-            });
-#else
-            tmp += v.x() * y[iybs + iqs + j / qr + 0];
-            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
-#endif // GGML_CUDA_F16
-        }
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (tid == 0) {
-#ifdef GGML_CUDA_F16
-        dst[row] = tmp.x + tmp.y;
-#else
-        dst[row] = tmp;
-#endif // GGML_CUDA_F16
-    }
-}
-
-static void mul_mat_p021_f16_f32(
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
-    const sycl::nd_item<3> &item_ct1) {
-
-    const sycl::half *x = (const sycl::half *)vx;
-
-    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                      item_ct1.get_local_id(1);
-    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                        item_ct1.get_local_id(0);
-    const int channel_x = channel / (nchannels_y / nchannels_x);
-
-    const int nrows_y = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x;
-         col_x0 += item_ct1.get_local_range(2)) {
-        const int col_x = col_x0 + item_ct1.get_local_id(2);
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        // x is transposed and permuted
-        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
-        const float xi =
-            sycl::vec<sycl::half, 1>(x[ix])
-                .convert<float, sycl::rounding_mode::automatic>()[0];
-
-        const int row_y = col_x;
-
-
-        // y is not transposed but permuted
-        const int iy = channel*nrows_y + row_y;
-
-        tmp += xi * y[iy];
-    }
-
-    // dst is not transposed and not permuted
-    const int idst = channel*nrows_dst + row_dst;
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
-    const sycl::nd_item<3> &item_ct1) {
-
-    const sycl::half *x = (const sycl::half *)vx;
-
-    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                      item_ct1.get_local_id(1);
-    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                        item_ct1.get_local_id(0);
-    const int channel_x = channel / channel_x_divisor;
-
-    const int nrows_y   = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst   = row_x;
-
-    const int idst = channel*nrows_dst + row_dst;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x;
-         col_x0 += item_ct1.get_local_range(2)) {
-        const int col_x = col_x0 + item_ct1.get_local_id(2);
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        const int row_y = col_x;
-
-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const int iy = channel*nrows_y + row_y;
-
-        const float xi =
-            sycl::vec<sycl::half, 1>(x[ix])
-                .convert<float, sycl::rounding_mode::automatic>()[0];
-
-        tmp += xi * y[iy];
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    float * dsti = (float *) cdsti;
-
-    *dsti = *xi;
-}
-
-static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    sycl::half *dsti = (sycl::half *)cdsti;
-
-    *dsti = sycl::vec<float, 1>(*xi)
-                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
-}
-
-static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
-    const sycl::half *xi = (const sycl::half *)cxi;
-    sycl::half *dsti = (sycl::half *)cdsti;
-
-    *dsti = *xi;
-}
-
-template <cpy_kernel_t cpy_1>
-static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
-                                   const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-                                   const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
-                                   const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= ne) {
-        return;
-    }
-
-    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
-    // then combine those indices with the corresponding byte offsets to get the total offsets
-    const int i02 = i / (ne00*ne01);
-    const int i01 = (i - i02*ne01*ne00) / ne00;
-    const int i00 = i - i02*ne01*ne00 - i01*ne00;
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
-
-    const int i12 = i / (ne10*ne11);
-    const int i11 = (i - i12*ne10*ne11) / ne10;
-    const int i10 = i - i12*ne10*ne11 - i11*ne10;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
-
-    cpy_1(cx + x_offset, cdst + dst_offset);
-}
-
-static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q8_0 * dsti = (block_q8_0 *) cdsti;
-
-    float amax = 0.0f; // absolute max
-
-    for (int j = 0; j < QK8_0; j++) {
-        const float v = xi[j];
-        amax = sycl::fmax(amax, sycl::fabs((float)v));
-    }
-
-    const float d = amax / ((1 << 7) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK8_0; ++j) {
-        const float x0 = xi[j]*id;
-
-        dsti->qs[j] = sycl::round((float)x0);
-    }
-}
-
-static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_0 * dsti = (block_q4_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK4_0; ++j) {
-        const float v = xi[j];
-        if (amax < sycl::fabs((float)v)) {
-            amax = sycl::fabs((float)v);
-            vmax = v;
-        }
-    }
-
-    const float d  = vmax / -8;
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK4_0/2; ++j) {
-        const float x0 = xi[0       + j]*id;
-        const float x1 = xi[QK4_0/2 + j]*id;
-
-        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f));
-        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_1 * dsti = (block_q4_1 *) cdsti;
-
-    float vmin = FLT_MAX;
-    float vmax = -FLT_MAX;
-
-    for (int j = 0; j < QK4_1; ++j) {
-        const float v = xi[j];
-
-        if (v < vmin) vmin = v;
-        if (v > vmax) vmax = v;
-    }
-
-    const float d  = (vmax - vmin) / ((1 << 4) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->dm.x() = d;
-    dsti->dm.y() = vmin;
-
-    for (int j = 0; j < QK4_1/2; ++j) {
-        const float x0 = (xi[0       + j] - vmin)*id;
-        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
-
-        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f));
-        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-template <cpy_kernel_t cpy_blck, int qk>
-static void cpy_f32_q(const char * cx, char * cdst, const int ne,
-                                 const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-                                 const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
-                                 const sycl::nd_item<3> &item_ct1) {
-    const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                   item_ct1.get_local_id(2)) *
-                  qk;
-
-    if (i >= ne) {
-        return;
-    }
-
-    const int i02 = i / (ne00*ne01);
-    const int i01 = (i - i02*ne01*ne00) / ne00;
-    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
-
-    const int i12 = i / (ne10*ne11);
-    const int i11 = (i - i12*ne10*ne11) / ne10;
-    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
-
-    cpy_blck(cx + x_offset, cdst + dst_offset);
-}
-
-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
-    return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
-}
-
-struct rope_corr_dims {
-    float v[4];
-};
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static void rope_yarn(
-    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale);
-    }
-    *cos_theta = sycl::cos(theta) * mscale;
-    *sin_theta = sycl::sin(theta) * mscale;
-}
-
-// rope == RoPE == rotary positional embedding
-template<typename T, bool has_pos>
-static void rope(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
-,
-    const sycl::nd_item<3> &item_ct1) {
-    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                         item_ct1.get_local_id(1));
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + 1];
-
-    dst[i + 0] = x0*cos_theta - x1*sin_theta;
-    dst[i + 1] = x0*sin_theta + x1*cos_theta;
-}
-
-template<typename T, bool has_pos>
-static void rope_neox(
-    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
-,
-    const sycl::nd_item<3> &item_ct1) {
-    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                         item_ct1.get_local_id(1));
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int ib = col / n_dims;
-    const int ic = col % n_dims;
-
-    if (ib > 0) {
-        const int i = row*ncols + ib*n_dims + ic;
-
-        dst[i + 0] = x[i + 0];
-        dst[i + 1] = x[i + 1];
-
-        return;
-    }
-
-    const int i  = row*ncols + ib*n_dims + ic/2;
-    const int i2 = row/p_delta_rows;
-
-    float cur_rot = inv_ndims * ic - ib;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base =
-        p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + n_dims/2];
-
-    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
-    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
-}
-
-static void rope_glm_f32(
-    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    int n_ctx
-, const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int half_n_dims = ncols/4;
-
-    if (col >= half_n_dims) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols);
-     // FIXME: this is likely wrong
-    const int p = pos != nullptr ? pos[i2] : 0;
-
-    const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale;
-    const float sin_theta = sycl::sin((float)theta);
-    const float cos_theta = sycl::cos((float)theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + half_n_dims];
-
-    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
-    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-
-    const float block_theta =
-        ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale;
-    const float sin_block_theta = sycl::sin((float)block_theta);
-    const float cos_block_theta = sycl::cos((float)block_theta);
-
-    const float x2 = x[i + half_n_dims * 2];
-    const float x3 = x[i + half_n_dims * 3];
-
-    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
-    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
-}
-
-static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                                 const int n_heads_log2_floor, const float m0, const float m1,
-                                 const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-
-    const int k = row/k_rows;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = dpct::pow(m0, k + 1);
-    } else {
-        m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    dst[i] = col * m_k + x[i];
-}
-
-static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
-                           const sycl::nd_item<3> &item_ct1) {
-    const int row = item_ct1.get_group(1);
-    const int col = item_ct1.get_local_id(2);
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum, item_ct1);
-
-    if (col == 0) {
-        dst[row] = sum;
-    }
-}
-
-template<typename T>
-static inline void swap(T & a, T & b) {
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-template<ggml_sort_order order>
-static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
-                              const sycl::nd_item<3> &item_ct1) {
-    // bitonic sort
-    int col = item_ct1.get_local_id(2);
-    int row = item_ct1.get_group(1);
-
-    if (col >= ncols) return;
-
-    const float * x_row = x + row * ncols;
-    int * dst_row = dst + row * ncols;
-
-    // initialize indices
-    if (col < ncols) {
-        dst_row[col] = col;
-    }
-    /*
-    DPCT1065:58: Consider replacing sycl::nd_item::barrier() with
-    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
-    performance if there is no access to global memory.
-    */
-    item_ct1.barrier();
-
-    for (int k = 2; k <= ncols; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
-                        swap(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
-                        swap(dst_row[col], dst_row[ixj]);
-                    }
-                }
-            }
-            /*
-            DPCT1118:11: SYCL group functions and algorithms must be encountered
-            in converged control flow. You may need to adjust the code.
-            */
-            /*
-            DPCT1065:59: Consider replacing sycl::nd_item::barrier() with
-            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-            better performance if there is no access to global memory.
-            */
-            item_ct1.barrier();
-        }
-    }
-}
-
-static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
-                              const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int i = row*ncols + col;
-    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
-    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
-}
-
-static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
-                         const sycl::nd_item<3> &item_ct1, float *buf) {
-    const int tid = item_ct1.get_local_id(2);
-    const int rowx = item_ct1.get_group(2);
-    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
-
-    const int block_size = item_ct1.get_local_range(2);
-
-    const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-    const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-
-    float max_val = -INFINITY;
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const int ix = rowx*ncols + col;
-        const int iy = rowy*ncols + col;
-        max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
-    }
-
-    // find the max value in the block
-    max_val = warp_reduce_max(max_val, item_ct1);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf[lane_id] = -INFINITY;
-        }
-        /*
-        DPCT1118:12: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:60: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-
-        if (lane_id == 0) {
-            buf[warp_id] = max_val;
-        }
-        /*
-        DPCT1118:13: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:61: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-
-        max_val = buf[lane_id];
-        max_val = warp_reduce_max(max_val, item_ct1);
-    }
-
-    float tmp = 0.f;
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const int ix = rowx*ncols + col;
-        const int iy = rowy*ncols + col;
-        const float val =
-            sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
-        tmp += val;
-        dst[ix] = val;
-    }
-
-    // find the sum of exps in the block
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf[lane_id] = 0.f;
-        }
-        /*
-        DPCT1118:14: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:62: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-
-        if (lane_id == 0) {
-            buf[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:15: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:63: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-
-        tmp = buf[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    const float inv_tmp = 1.f / tmp;
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const int i = rowx*ncols + col;
-        dst[i] *= inv_tmp;
-    }
-}
-
-static void scale_f32(const float * x, float * dst, const float scale, const int k,
-                      const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = scale * x[i];
-}
-
-static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k,
-                      const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
-}
-
-static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta,
-                           int IW, int IH, int OW, int KW, int KH,
-                           int pelements, int CHW, int s0, int s1, int p0,
-                           int p1, int d0, int d1,
-                           const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_id(2) +
-                  item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (i >= pelements) {
-        return;
-    }
-
-    const int ksize = OW * (KH > 1 ? KW : 1);
-    const int kx = i / ksize;
-    const int kd = kx * ksize;
-    const int ky = (i - kd) / OW;
-    const int ix = i % OW;
-
-    const int64_t iiw = ix * s0 + kx * d0 - p0;
-    const int64_t iih = item_ct1.get_group(1) * s1 + ky * d1 - p1;
-
-    const int64_t offset_dst =
-        (item_ct1.get_group(1) * OW + ix) * CHW +
-        (item_ct1.get_group(0) * (KW * KH) + ky * KW + kx);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] =
-            sycl::vec<float, 1>(0.0f)
-                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
-    } else {
-        const int64_t offset_src = item_ct1.get_group(0) * offset_delta;
-        dst[offset_dst] =
-            sycl::vec<float, 1>(x[offset_src + iih * IW + iiw])
-                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const ggml_tensor *src0, const ggml_tensor *src1,
-                          ggml_tensor *dst, const void *src0_dd,
-                          const int32_t *src1_dd, float *dst_dd,
-                          dpct::queue_ptr stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    GGML_ASSERT(ne00 % 2 == 0);
-
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             k_get_rows<qk, qr, dq>(
-                                 src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
-                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
-                         });
-
-    (void) dst;
-}
-
-template <typename src0_t>
-static void get_rows_cuda_float(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const src0_t *src0_dd, const int32_t *src1_dd,
-                                float *dst_dd, dpct::queue_ptr stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
-    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
-                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
-            });
-    }
-
-    (void) dst;
-}
-
-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_cuda {
-    template <typename src0_t, typename src1_t, typename dst_t>
-    void operator()(const struct ggml_tensor *src0,
-                    const struct ggml_tensor *src1, struct ggml_tensor *dst,
-                    const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
-                    dpct::queue_ptr stream) {
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        int nr0 = ne10/ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne0[] = {ne0, ne1, ne2, ne3};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-        size_t cnb0[] = {nb0, nb1, nb2, nb3};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        for (int i = 0; i < 4; i++) {
-            if (nr[i] != 1) {
-                break;
-            }
-            if (i > 0) {
-                collapse_nb(cnb0, cne0);
-                collapse_nb(cnb1, cne1);
-                collapse(cne0);
-                collapse(cne1);
-            }
-        }
-        {
-            int64_t ne0 = cne0[0];
-            int64_t ne1 = cne0[1];
-            int64_t ne2 = cne0[2];
-            int64_t ne3 = cne0[3];
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb0[0];
-            size_t nb1 = cnb0[1];
-            size_t nb2 = cnb0[2];
-            size_t nb3 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            sycl::range<3> block_dims(1, 1, 1);
-            block_dims[2] = std::min<unsigned int>(hne0, block_size);
-            block_dims[1] = std::min<unsigned int>(
-                ne1, block_size / (unsigned int)block_dims[2]);
-            block_dims[0] = std::min(
-                std::min<unsigned int>(
-                    ne2 * ne3, block_size / (unsigned int)block_dims[2] /
-                                   (unsigned int)block_dims[1]),
-                64U);
-
-            sycl::range<3> block_nums(
-                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
-                (ne1 + block_dims[1] - 1) / block_dims[1],
-                (hne0 + block_dims[2] - 1) / block_dims[2]);
-
-            if (block_nums[0] > 65535) {
-                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                {
-                    dpct::has_capability_or_fail(stream->get_device(),
-                                                 {sycl::aspect::fp16});
-
-                    stream->parallel_for(
-                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
-                                              sycl::range<3>(1, 1, block_size),
-                                          sycl::range<3>(1, 1, block_size)),
-                        [=](sycl::nd_item<3> item_ct1) {
-                            k_bin_bcast_unravel<bin_op>(
-                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
-                                ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
-                                s13, item_ct1);
-                        });
-                }
-            } else {
-                /*
-                DPCT1049:16: The work-group size passed to the SYCL kernel may
-                exceed the limit. To get the device limit, query
-                info::device::max_work_group_size. Adjust the work-group size if
-                needed.
-                */
-                dpct::has_capability_or_fail(stream->get_device(),
-                                             {sycl::aspect::fp16});
-
-                stream->parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
-                                            ne2, ne3, ne10, ne11, ne12, ne13,
-                                            s1, s2, s3, s11, s12, s13,
-                                            item_ct1);
-                    });
-            }
-        }
-    }
-};
-
-static void acc_f32_cuda(const float *x, const float *y, float *dst,
-                         const int n_elements, const int ne10, const int ne11,
-                         const int ne12, const int nb1, const int nb2,
-                         const int offset, dpct::queue_ptr stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
-                    item_ct1);
-        });
-}
-
-static void gelu_f32_cuda(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            gelu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void silu_f32_cuda(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            silu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void gelu_quick_f32_cuda(const float *x, float *dst, const int k,
-                                dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            gelu_quick_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void tanh_f32_cuda(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            tanh_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void relu_f32_cuda(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            relu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void leaky_relu_f32_cuda(const float *x, float *dst, const int k,
-                                const float negative_slope,
-                                dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
-        });
-}
-
-static void sqr_f32_cuda(const float *x, float *dst, const int k,
-                         dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            sqr_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void norm_f32_cuda(const float *x, float *dst, const int ncols,
-                          const int nrows, const float eps,
-                          dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(32), cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
-                                            s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    } else {
-        const sycl::range<3> block_dims(1, 1, 1024);
-        /*
-        DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(32), cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32<1024>(x, dst, ncols, eps, item_ct1,
-                                       s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    }
-}
-
-static void group_norm_f32_cuda(const float *x, float *dst,
-                                const int num_groups, const int group_size,
-                                const int ne_elements, dpct::queue_ptr stream) {
-    static const float eps = 1e-6f;
-    if (group_size < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            const float eps_ct4 = eps;
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32<WARP_SIZE>(
-                            x, dst, group_size, ne_elements, eps_ct4, item_ct1,
-                            s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    } else {
-        const sycl::range<3> block_dims(1, 1, 1024);
-        /*
-        DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            const float eps_ct4 = eps;
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32<1024>(x, dst, group_size, ne_elements,
-                                             eps_ct4, item_ct1,
-                                             s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    }
-}
-
-static void concat_f32_cuda(const float *x, const float *y, float *dst,
-                            const int ne0, int ne1, int ne2, int ne02,
-                            dpct::queue_ptr stream) {
-    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            concat_f32(x, y, dst, ne0, ne02, item_ct1);
-        });
-}
-
-static void upscale_f32_cuda(const float *x, float *dst, const int ne00,
-                             const int ne01, const int ne02,
-                             const int scale_factor, dpct::queue_ptr stream) {
-    int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
-        });
-}
-
-static void pad_f32_cuda(const float *x, float *dst, const int ne00,
-                         const int ne01, const int ne02, const int ne0,
-                         const int ne1, const int ne2, dpct::queue_ptr stream) {
-    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
-        });
-}
-
-static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols,
-                              const int nrows, const float eps,
-                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
-                                                s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    } else {
-        const sycl::range<3> block_dims(1, 1, 1024);
-        /*
-        DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32<1024>(x, dst, ncols, eps, item_ct1,
-                                           s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    }
-}
-
-static void quantize_row_q8_1_cuda(const float *x, void *vy, const int kx,
-                                   const int ky, const int kx_padded,
-                                   dpct::queue_ptr stream) {
-    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const sycl::range<3> num_blocks(1, ky, block_num_x);
-    const sycl::range<3> block_size(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(num_blocks * block_size, block_size),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                quantize_q8_1(x, vy, kx, kx_padded, item_ct1);
-            });
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void *__restrict__ vx,
-                                  dst_t *__restrict__ y, const int k,
-                                  dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(
-                sycl::range<3>(1, 1, num_blocks) *
-                    sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE),
-                sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
-            });
-    }
-}
-
-template <typename dst_t>
-static void dequantize_row_q2_K_cuda(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q2_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template <typename dst_t>
-static void dequantize_row_q3_K_cuda(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q3_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template <typename dst_t>
-static void dequantize_row_q4_K_cuda(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q4_K(vx, y, item_ct1);
-                             });
-    }
-}
-
-template <typename dst_t>
-static void dequantize_row_q5_K_cuda(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q5_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template <typename dst_t>
-static void dequantize_row_q6_K_cuda(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q6_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
-        case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
-        case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
-        case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F32:
-            return dequantize_block_cuda<1, 1, convert_f32>;
-        default:
-            return nullptr;
-    }
-}
-
-static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
-        case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
-        case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
-        case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F16:
-            return dequantize_block_cuda<1, 1, convert_f16>;
-        default:
-            return nullptr;
-    }
-}
-
-static void dequantize_mul_mat_vec_q4_0_cuda(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q4_1_cuda(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q5_0_cuda(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q5_1_cuda(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q8_0_cuda(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q2_K_cuda(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q3_K_cuda(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q4_K_cuda(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q5_K_cuda(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const sycl::range<3> block_dims(1, 1, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q6_K_cuda(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void convert_mul_mat_vec_f16_cuda(const void *vx, const dfloat *y,
-                                         float *dst, const int ncols,
-                                         const int nrows,
-                                         dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
-                                                          nrows, item_ct1);
-            });
-    }
-}
-
-static void mul_mat_vec_q4_0_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK4_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
-                          vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK4_1 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
-                          vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK5_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
-                          vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK5_1 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
-                          vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK8_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
-                          vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
-                          vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
-                          vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
-                          vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
-                          vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
-                          vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
-        nwarps = NWARPS_Q4_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
-        nwarps = NWARPS_Q4_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
-        nwarps = NWARPS_Q4_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
-        nwarps = NWARPS_Q4_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:20: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_0_acc_ct1.get_pointer(),
-                            tile_x_d_q4_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:21: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q4_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q4_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_0) + mmq_y / QI4_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_0_acc_ct1.get_pointer(),
-                            tile_x_d_q4_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_1_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
-        nwarps = NWARPS_Q4_1_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_1_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
-        nwarps = NWARPS_Q4_1_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_1_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
-        nwarps = NWARPS_Q4_1_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_1_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
-        nwarps = NWARPS_Q4_1_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:22: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_1<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:23: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q4_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + +mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_1) + mmq_y / QI4_1),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_1<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q4_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
-        nwarps = NWARPS_Q5_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
-        nwarps = NWARPS_Q5_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
-        nwarps = NWARPS_Q5_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
-        nwarps = NWARPS_Q5_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:24: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_0_acc_ct1.get_pointer(),
-                            tile_x_d_q5_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:25: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q5_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_0) + mmq_y / QI5_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_0_acc_ct1.get_pointer(),
-                            tile_x_d_q5_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_1_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
-        nwarps = NWARPS_Q5_1_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_1_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
-        nwarps = NWARPS_Q5_1_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_1_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
-        nwarps = NWARPS_Q5_1_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_1_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
-        nwarps = NWARPS_Q5_1_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:26: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_1<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:27: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_1_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_1) + mmq_y / QI5_1),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_1<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_1_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_1_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q8_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
-        nwarps = NWARPS_Q8_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q8_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
-        nwarps = NWARPS_Q8_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q8_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
-        nwarps = NWARPS_Q8_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q8_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
-        nwarps = NWARPS_Q8_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:28: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q8_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q8_0_acc_ct1.get_pointer(),
-                            tile_x_d_q8_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:29: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_qs_q8_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<float, 1> tile_x_d_q8_0_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI8_0) + mmq_y / QI8_0),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q8_0<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_qs_q8_0_acc_ct1.get_pointer(),
-                            tile_x_d_q8_0_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q2_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
-        nwarps = NWARPS_Q2_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q2_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
-        nwarps = NWARPS_Q2_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q2_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
-        nwarps = NWARPS_Q2_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q2_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
-        nwarps = NWARPS_Q2_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:30: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q2_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q2_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q2_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q2_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:31: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI2_K) + mmq_y / QI2_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q2_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q2_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q2_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q2_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q2_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-#if QK_K == 256
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q3_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
-        nwarps = NWARPS_Q3_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q3_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
-        nwarps = NWARPS_Q3_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q3_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
-        nwarps = NWARPS_Q3_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q3_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
-        nwarps = NWARPS_Q3_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q3_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q3_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q3_K_acc_ct1.get_pointer(),
-                            tile_x_qh_q3_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q3_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI3_K) + mmq_y / QI3_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_qh_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 2) + mmq_y / 2), cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q3_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 4) + mmq_y / 4), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q3_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q3_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q3_K_acc_ct1.get_pointer(),
-                            tile_x_qh_q3_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q3_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-#endif
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
-        nwarps = NWARPS_Q4_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
-        nwarps = NWARPS_Q4_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
-        nwarps = NWARPS_Q4_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
-        nwarps = NWARPS_Q4_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q4_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q4_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI4_K) + mmq_y / QI4_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q4_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q4_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q4_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q4_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q4_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
-        nwarps = NWARPS_Q5_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
-        nwarps = NWARPS_Q5_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
-        nwarps = NWARPS_Q5_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
-        nwarps = NWARPS_Q5_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q5_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI5_K) + mmq_y / QI5_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_q5_K_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q5_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_q5_K_acc_ct1.get_pointer(),
-                            tile_x_dm_q5_K_acc_ct1.get_pointer(),
-                            tile_x_sc_q5_K_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    const int compute_capability = g_device_caps[id].cc;
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q6_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
-        nwarps = NWARPS_Q6_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q6_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
-        nwarps = NWARPS_Q6_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q6_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
-        nwarps = NWARPS_Q6_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q6_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
-        nwarps = NWARPS_Q6_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q6_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            stream->submit([&](sycl::handler &cgh) {
-                sycl::local_accessor<int, 1> tile_x_ql_acc_ct1(
-                    sycl::range<1>(mmq_y * (2 * WARP_SIZE) + mmq_y), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_x_dm_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / QI6_K) + mmq_y / QI6_K),
-                    cgh);
-                sycl::local_accessor<int, 1> tile_x_sc_acc_ct1(
-                    sycl::range<1>(mmq_y * (WARP_SIZE / 8) + mmq_y / 8), cgh);
-                sycl::local_accessor<int, 1> tile_y_qs_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE), cgh);
-                sycl::local_accessor<sycl::half2, 1> tile_y_ds_acc_ct1(
-                    sycl::range<1>(mmq_x * WARP_SIZE / QI8_1), cgh);
-
-                cgh.parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        mul_mat_q6_K<need_check>(
-                            vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y,
-                            nrows_dst, item_ct1,
-                            tile_x_ql_acc_ct1.get_pointer(),
-                            tile_x_dm_acc_ct1.get_pointer(),
-                            tile_x_sc_acc_ct1.get_pointer(),
-                            tile_y_qs_acc_ct1.get_pointer(),
-                            tile_y_ds_acc_ct1.get_pointer());
-                    });
-            });
-        }
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_p021_f16_f32_cuda(const void *vx, const float *y,
-                                           float *dst, const int ncols_x,
-                                           const int nrows_x,
-                                           const int nchannels_x,
-                                           const int nchannels_y,
-                                           dpct::queue_ptr stream) {
-
-    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
-    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
-                                     nchannels_y, item_ct1);
-            });
-    }
-}
-
-static void ggml_mul_mat_vec_nc_f16_f32_cuda(
-    const void *vx, const float *y, float *dst, const int ncols_x,
-    const int nrows_x, const int row_stride_x, const int nchannels_x,
-    const int nchannels_y, const int channel_stride_x, dpct::queue_ptr stream) {
-
-    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
-    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
-                                       row_stride_x, channel_stride_x,
-                                       nchannels_y / nchannels_x, item_ct1);
-            });
-    }
-}
-
-static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne,
-                                  const int ne00, const int ne01,
-                                  const int nb00, const int nb01,
-                                  const int nb02, const int ne10,
-                                  const int ne11, const int nb10,
-                                  const int nb11, const int nb12,
-                                  dpct::queue_ptr stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
-                                           nb02, ne10, ne11, nb10, nb11, nb12,
-                                           item_ct1);
-            });
-    }
-}
-
-static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne,
-                                  const int ne00, const int ne01,
-                                  const int nb00, const int nb01,
-                                  const int nb02, const int ne10,
-                                  const int ne11, const int nb10,
-                                  const int nb11, const int nb12,
-                                  dpct::queue_ptr stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
-                                           nb02, ne10, ne11, nb10, nb11, nb12,
-                                           item_ct1);
-            });
-    }
-}
-
-static void ggml_cpy_f32_q8_0_cuda(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int nb00, const int nb01,
-                                   const int nb02, const int ne10,
-                                   const int ne11, const int nb10,
-                                   const int nb11, const int nb12,
-                                   dpct::queue_ptr stream) {
-
-    GGML_ASSERT(ne % QK8_0 == 0);
-    const int num_blocks = ne / QK8_0;
-    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                           sycl::range<3>(1, 1, 1)),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
-                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
-                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
-                         });
-}
-
-static void ggml_cpy_f32_q4_0_cuda(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int nb00, const int nb01,
-                                   const int nb02, const int ne10,
-                                   const int ne11, const int nb10,
-                                   const int nb11, const int nb12,
-                                   dpct::queue_ptr stream) {
-
-    GGML_ASSERT(ne % QK4_0 == 0);
-    const int num_blocks = ne / QK4_0;
-    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                           sycl::range<3>(1, 1, 1)),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
-                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
-                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
-                         });
-}
-
-static void ggml_cpy_f32_q4_1_cuda(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int nb00, const int nb01,
-                                   const int nb02, const int ne10,
-                                   const int ne11, const int nb10,
-                                   const int nb11, const int nb12,
-                                   dpct::queue_ptr stream) {
-
-    GGML_ASSERT(ne % QK4_1 == 0);
-    const int num_blocks = ne / QK4_1;
-    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                           sycl::range<3>(1, 1, 1)),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
-                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
-                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
-                         });
-}
-
-static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne,
-                                  const int ne00, const int ne01,
-                                  const int nb00, const int nb01,
-                                  const int nb02, const int ne10,
-                                  const int ne11, const int nb10,
-                                  const int nb11, const int nb12,
-                                  dpct::queue_ptr stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
-                                           nb02, ne10, ne11, nb10, nb11, nb12,
-                                           item_ct1);
-            });
-    }
-}
-
-static void scale_f32_cuda(const float *x, float *dst, const float scale,
-                           const int k, dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            scale_f32(x, dst, scale, k, item_ct1);
-        });
-}
-
-static void clamp_f32_cuda(const float *x, float *dst, const float min,
-                           const float max, const int k,
-                           dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            clamp_f32(x, dst, min, max, k, item_ct1);
-        });
-}
-
-template <typename T>
-static void rope_cuda(const T *x, T *dst, int ncols, int nrows,
-                      const int32_t *pos, float freq_scale, int p_delta_rows,
-                      float freq_base, float ext_factor, float attn_factor,
-                      rope_corr_dims corr_dims, dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
-    if (pos == nullptr) {
-        /*
-        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows,
-                               freq_base, ext_factor, attn_factor, corr_dims,
-                               item_ct1);
-            });
-    } else {
-        /*
-        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows,
-                              freq_base, ext_factor, attn_factor, corr_dims,
-                              item_ct1);
-            });
-    }
-}
-
-template <typename T>
-static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows,
-                           const int32_t *pos, float freq_scale,
-                           int p_delta_rows, float freq_base, float ext_factor,
-                           float attn_factor, rope_corr_dims corr_dims,
-                           dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.0f / n_dims;
-
-    if (pos == nullptr) {
-        /*
-        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope_neox<T, false>(x, dst, ncols, n_dims, pos, freq_scale,
-                                    p_delta_rows, ext_factor, attn_factor,
-                                    corr_dims, theta_scale, inv_ndims,
-                                    item_ct1);
-            });
-    } else {
-        /*
-        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope_neox<T, true>(x, dst, ncols, n_dims, pos, freq_scale,
-                                   p_delta_rows, ext_factor, attn_factor,
-                                   corr_dims, theta_scale, inv_ndims, item_ct1);
-            });
-    }
-}
-
-static void rope_glm_f32_cuda(const float *x, float *dst, int ncols, int nrows,
-                              const int32_t *pos, float freq_scale,
-                              int p_delta_rows, float freq_base, int n_ctx,
-                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % 4 == 0);
-    const sycl::range<3> block_dims(1, 1, CUDA_ROPE_BLOCK_SIZE / 4);
-    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
-    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             rope_glm_f32(x, dst, ncols, pos, freq_scale,
-                                          p_delta_rows, freq_base, n_ctx,
-                                          item_ct1);
-                         });
-}
-
-static void alibi_f32_cuda(const float *x, float *dst, const int ncols,
-                           const int nrows, const int k_rows,
-                           const int n_heads_log2_floor, const float m0,
-                           const float m1, dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, 1, CUDA_ALIBI_BLOCK_SIZE);
-    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             alibi_f32(x, dst, ncols, k_rows,
-                                       n_heads_log2_floor, m0, m1, item_ct1);
-                         });
-}
-
-static void sum_rows_f32_cuda(const float *x, float *dst, const int ncols,
-                              const int nrows, dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-    const sycl::range<3> block_nums(1, nrows, 1);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1)
-                             [[intel::reqd_sub_group_size(32)]] {
-                                 k_sum_rows_f32(x, dst, ncols, item_ct1);
-                             });
-}
-
-static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols,
-                                 const int nrows, ggml_sort_order order,
-                                 dpct::queue_ptr stream) {
-    // bitonic sort requires ncols to be power of 2
-    GGML_ASSERT((ncols & (ncols - 1)) == 0);
-
-    const sycl::range<3> block_dims(1, 1, ncols);
-    const sycl::range<3> block_nums(1, nrows, 1);
-    if (order == GGML_SORT_ASC) {
-        /*
-        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
-            });
-    } else if (order == GGML_SORT_DESC) {
-        /*
-        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
-            });
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-static void diag_mask_inf_f32_cuda(const float *x, float *dst,
-                                   const int ncols_x, const int nrows_x,
-                                   const int rows_per_channel, const int n_past,
-                                   dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
-    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const sycl::range<3> block_nums(1, block_num_x, nrows_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             diag_mask_inf_f32(x, dst, ncols_x,
-                                               rows_per_channel, n_past,
-                                               item_ct1);
-                         });
-}
-
-static void soft_max_f32_cuda(const float *x, const float *y, float *dst,
-                              const int ncols_x, const int nrows_x,
-                              const int nrows_y, const float scale,
-                              dpct::queue_ptr stream) {
-    int nth = WARP_SIZE;
-    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
-    const sycl::range<3> block_dims(1, 1, nth);
-    const sycl::range<3> block_nums(1, 1, nrows_x);
-    /*
-    DPCT1049:46: The work-group size passed to the SYCL kernel may exceed the
-    limit. To get the device limit, query info::device::max_work_group_size.
-    Adjust the work-group size if needed.
-    */
-    stream->submit([&](sycl::handler &cgh) {
-        /*
-        DPCT1101:96: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
-        replaced with a value. Modify the code to use the original expression,
-        provided in comments, if it is correct.
-        */
-        sycl::local_accessor<float, 1> buf_acc_ct1(
-            sycl::range<1>(32 /*CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
-                             buf_acc_ct1.get_pointer());
-            });
-    });
-}
-
-static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH,
-                                int OW, int OH, int KW, int KH, int IC,
-                                int offset_delta, int s0, int s1, int p0,
-                                int p1, int d0, int d1,
-                                dpct::queue_ptr stream) {
-    const int parallel_elements = OW * KW * KH;
-    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    sycl::range<3> block_nums(IC, OH, num_blocks);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums *
-                                  sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH,
-                               parallel_elements, (IC * KH * KW), s0, s1, p0,
-                               p1, d0, d1, item_ct1);
-            });
-    }
-}
-
-// buffer pool for cuda
-#define MAX_CUDA_BUFFERS 256
-
-struct scoped_spin_lock {
-    std::atomic_flag& lock;
-    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
-        while (lock.test_and_set(std::memory_order_acquire)) {
-            ; // spin
-        }
-    }
-    ~scoped_spin_lock() {
-        lock.clear(std::memory_order_release);
-    }
-    scoped_spin_lock(const scoped_spin_lock&) = delete;
-    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
-};
-
-static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
-
-// #define DEBUG_CUDA_MALLOC
-struct cuda_buffer {
-    void * ptr = nullptr;
-    size_t size = 0;
-};
-
-static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0};
-
-static void *ggml_cuda_pool_malloc_leg(size_t size, size_t *actual_size) try {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-#ifdef DEBUG_CUDA_MALLOC
-    int nnz = 0;
-    size_t max_size = 0;
-#endif
-    size_t best_diff = 1ull << 36;
-    int ibest = -1;
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][i];
-        if (b.ptr != nullptr) {
-#ifdef DEBUG_CUDA_MALLOC
-            ++nnz;
-            if (b.size > max_size) max_size = b.size;
-#endif
-            if (b.size >= size) {
-                size_t diff = b.size - size;
-                if (diff < best_diff) {
-                    best_diff = diff;
-                    ibest = i;
-                    if (!best_diff) {
-                        void * ptr = b.ptr;
-                        *actual_size = b.size;
-                        b.ptr = nullptr;
-                        b.size = 0;
-                        return ptr;
-                    }
-                }
-            }
-        }
-    }
-    if (ibest >= 0) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
-        void * ptr = b.ptr;
-        *actual_size = b.size;
-        b.ptr = nullptr;
-        b.size = 0;
-        return ptr;
-    }
-    void * ptr;
-    size_t look_ahead_size = (size_t) (1.05 * size);
-    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(
-                             look_ahead_size, dpct::get_in_order_queue())));
-    *actual_size = look_ahead_size;
-    g_cuda_pool_size[id] += look_ahead_size;
-#ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024));
-#endif
-    return ptr;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_pool_free_leg(void *ptr, size_t size) try {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][i];
-        if (b.ptr == nullptr) {
-            b.ptr = ptr;
-            b.size = size;
-            return;
-        }
-    }
-    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
-    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
-    g_cuda_pool_size[id] -= size;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-#if !defined(GGML_USE_HIPBLAS)
-// pool with virtual memory
-/*
-DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported.
-*/
-static std::vector<CUmemGenericAllocationHandle>
-    g_cuda_pool_handles[GGML_CUDA_MAX_DEVICES];
-static dpct::device_ptr g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0};
-static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0};
-static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 36; // 64 GB
-
-static void *ggml_cuda_pool_malloc_vmm(size_t size, size_t *actual_size) try {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-
-    // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
-    const size_t alignment = 128;
-    size = alignment * ((size + alignment - 1) / alignment);
-
-    size_t avail = g_cuda_pool_size[id] - g_cuda_pool_used[id];
-
-    if (size > avail) {
-        // round up to the next multiple of the granularity
-        size_t reserve_size = size - avail;
-        const size_t granularity = g_device_caps[id].vmm_granularity;
-        reserve_size = granularity * ((reserve_size + granularity - 1) / granularity);
-
-        GGML_ASSERT(g_cuda_pool_size[id] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE);
-
-        // allocate more physical memory
-        /*
-        DPCT1082:65: Migration of CUmemAllocationProp type is not supported.
-        */
-        CUmemAllocationProp prop = {};
-        prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-        prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        prop.location.id = id;
-        /*
-        DPCT1082:66: Migration of CUmemGenericAllocationHandle type is not
-        supported.
-        */
-        CUmemGenericAllocationHandle handle;
-        /*
-        DPCT1007:69: Migration of cuMemCreate is not supported.
-        */
-        CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0));
-
-        // reserve virtual address space (if not already reserved)
-        if (g_cuda_pool_addr[id] == 0) {
-            /*
-            DPCT1007:70: Migration of cuMemAddressReserve is not supported.
-            */
-            CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[id],
-                                         CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0));
-        }
-
-        // map at the end of the pool
-        /*
-        DPCT1007:71: Migration of cuMemMap is not supported.
-        */
-        CU_CHECK(cuMemMap(g_cuda_pool_addr[id] + g_cuda_pool_size[id],
-                          reserve_size, 0, handle, 0));
-
-        // set access
-        /*
-        DPCT1082:72: Migration of CUmemAccessDesc type is not supported.
-        */
-        CUmemAccessDesc access = {};
-        access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-        access.location.id = id;
-        access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-        /*
-        DPCT1007:73: Migration of cuMemSetAccess is not supported.
-        */
-        CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[id] + g_cuda_pool_size[id],
-                                reserve_size, &access, 1));
-
-        // add to the pool
-        g_cuda_pool_handles[id].push_back(handle);
-        g_cuda_pool_size[id] += reserve_size;
-
-        //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
-        //       id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024),
-        //       (unsigned long long) (reserve_size/1024/1024));
-    }
-
-    GGML_ASSERT(g_cuda_pool_addr[id] != 0);
-
-    void * ptr = (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]);
-    *actual_size = size;
-    g_cuda_pool_used[id] += size;
-
-#ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: allocated %llu bytes at %llx [%s]\n", id, (unsigned long long) size, ptr);
-#endif
-
-    return ptr;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_pool_free_vmm(void *ptr, size_t size) try {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-
-#ifdef DEBUG_CUDA_MALLOC
-    printf("cuda pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr);
-#endif
-
-    g_cuda_pool_used[id] -= size;
-
-    // all deallocations must be in reverse order of the allocations
-    GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[id] + g_cuda_pool_used[id]));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try {
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    if (g_device_caps[id].vmm) {
-        return ggml_cuda_pool_malloc_vmm(size, actual_size);
-    } else {
-        return ggml_cuda_pool_malloc_leg(size, actual_size);
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_pool_free(void *ptr, size_t size) try {
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-    if (g_device_caps[id].vmm) {
-        ggml_cuda_pool_free_vmm(ptr, size);
-    } else {
-        ggml_cuda_pool_free_leg(ptr, size);
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-#else
-#define ggml_cuda_pool_malloc ggml_cuda_pool_malloc_leg
-#define ggml_cuda_pool_free ggml_cuda_pool_free_leg
-#endif // !defined(GGML_USE_HIPBLAS)
-
-template<typename T>
-struct cuda_pool_alloc {
-    T * ptr = nullptr;
-    size_t actual_size = 0;
-
-    // size is in number of elements
-    T * alloc(size_t size) {
-        GGML_ASSERT(ptr == nullptr);
-        ptr = (T *) ggml_cuda_pool_malloc(size * sizeof(T), &this->actual_size);
-        return ptr;
-    }
-
-    cuda_pool_alloc(size_t size) {
-        alloc(size);
-    }
-
-    ~cuda_pool_alloc() {
-        if (ptr != nullptr) {
-            ggml_cuda_pool_free(ptr, actual_size);
-        }
-    }
-
-    T * get() {
-        return ptr;
-    }
-
-    cuda_pool_alloc() = default;
-    cuda_pool_alloc(const cuda_pool_alloc &) = delete;
-    cuda_pool_alloc(cuda_pool_alloc &&) = delete;
-    cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete;
-    cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete;
-};
-
-static bool g_cublas_loaded = false;
-
-bool ggml_cublas_loaded(void) {
-    return g_cublas_loaded;
-}
-
-void ggml_init_cublas() try {
-    static bool initialized = false;
-
-    if (!initialized) {
-
-#ifdef __HIP_PLATFORM_AMD__
-        // Workaround for a rocBLAS bug when using multiple graphics cards:
-        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
-        rocblas_initialize();
-        CUDA_CHECK(cudaDeviceSynchronize());
-#endif
-
-        if (DPCT_CHECK_ERROR(g_device_count =
-                                 dpct::dev_mgr::instance().device_count()) !=
-            0) {
-            initialized = true;
-            g_cublas_loaded = false;
-            return;
-        }
-
-        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
-        int64_t total_vram = 0;
-#if defined(GGML_CUDA_FORCE_MMQ)
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
-#else
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
-#endif
-#if defined(CUDA_USE_TENSOR_CORES)
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-#endif
-        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-        for (int id = 0; id < g_device_count; ++id) {
-            int device_vmm = 0;
-
-#if !defined(GGML_USE_HIPBLAS)
-            int device;
-            CU_CHECK(DPCT_CHECK_ERROR(device = id));
-            /*
-            DPCT1028:74: The cuDeviceGetAttribute was not migrated because
-            parameter CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED is
-            unsupported.
-            */
-            CU_CHECK(cuDeviceGetAttribute(
-                &device_vmm,
-                CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
-                device));
-
-            if (device_vmm) {
-                /*
-                DPCT1082:75: Migration of CUmemAllocationProp type is not
-                supported.
-                */
-                CUmemAllocationProp alloc_prop = {};
-                alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
-                alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
-                alloc_prop.location.id = id;
-                /*
-                DPCT1007:76: Migration of cuMemGetAllocationGranularity is not
-                supported.
-                */
-                CU_CHECK(cuMemGetAllocationGranularity(
-                    &g_device_caps[id].vmm_granularity, &alloc_prop,
-                    CU_MEM_ALLOC_GRANULARITY_MINIMUM));
-            }
-#endif // !defined(GGML_USE_HIPBLAS)
-            g_device_caps[id].vmm = !!device_vmm;
-
-            dpct::device_info prop;
-            CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
-                prop, dpct::dev_mgr::instance().get_device(id))));
-            /*
-            DPCT1005:77: The SYCL device version is different from CUDA Compute
-            Compatibility. You may need to rewrite this code.
-            */
-            fprintf(stderr,
-                    "  Device %d: %s, compute capability %d.%d, VMM: %s\n", id,
-                    prop.get_name(), prop.get_major_version(),
-                    prop.get_minor_version(), device_vmm ? "yes" : "no");
-
-            g_tensor_split[id] = total_vram;
-            total_vram += prop.get_global_mem_size();
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-            g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
-#else
-            /*
-            DPCT1005:78: The SYCL device version is different from CUDA Compute
-            Compatibility. You may need to rewrite this code.
-            */
-            g_device_caps[id].cc =
-                100 * prop.get_major_version() + 10 * prop.get_minor_version();
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-        }
-        for (int id = 0; id < g_device_count; ++id) {
-            g_tensor_split[id] /= total_vram;
-        }
-
-        for (int id = 0; id < g_device_count; ++id) {
-            CUDA_CHECK(ggml_cuda_set_device(id));
-
-            // create cuda streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
-                /*
-                DPCT1025:79: The SYCL queue is created ignoring the flag and
-                priority options.
-                */
-                CUDA_CHECK(DPCT_CHECK_ERROR(
-                    g_cudaStreams[id][is] =
-                        dpct::get_current_device().create_queue()));
-            }
-
-            // create cublas handle
-            CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
-                                              &dpct::get_in_order_queue()));
-            /*
-            DPCT1027:80: The call to cublasSetMathMode was replaced with 0
-            because this functionality is redundant in SYCL.
-            */
-            CUBLAS_CHECK(0);
-        }
-
-        // configure logging to stdout
-        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
-
-        initialized = true;
-        g_cublas_loaded = true;
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_set_tensor_split(const float * tensor_split) {
-    if (tensor_split == nullptr) {
-        return;
-    }
-    bool all_zero = true;
-    for (int i = 0; i < g_device_count; ++i) {
-        if (tensor_split[i] != 0.0f) {
-            all_zero = false;
-            break;
-        }
-    }
-    if (all_zero) {
-        return;
-    }
-    float split_sum = 0.0f;
-    for (int i = 0; i < g_device_count; ++i) {
-        g_tensor_split[i] = split_sum;
-        split_sum += tensor_split[i];
-    }
-    for (int i = 0; i < g_device_count; ++i) {
-        g_tensor_split[i] /= split_sum;
-    }
-}
-
-void *ggml_cuda_host_malloc(size_t size) try {
-    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
-        return nullptr;
-    }
-
-    void * ptr = nullptr;
-    dpct::err0 err = DPCT_CHECK_ERROR(
-        ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
-    /*
-    DPCT1000:82: Error handling if-stmt was detected but could not be rewritten.
-    */
-    if (err != 0) {
-        // clear the error
-        /*
-        DPCT1026:83: The call to cudaGetLastError was removed because this
-        functionality is redundant in SYCL.
-        */
-        /*
-        DPCT1001:81: The statement could not be removed.
-        */
-        fprintf(
-            stderr,
-            "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-            /*
-            DPCT1009:84: SYCL uses exceptions to report errors and does not use
-            the error codes. The original code was commented out and a warning
-            string was inserted. You need to rewrite this code.
-            */
-            size / 1024.0 / 1024.0,
-            "cudaGetErrorString is not supported" /*cudaGetErrorString(err)*/);
-        return nullptr;
-    }
-
-    return ptr;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_host_free(void *ptr) try {
-    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
-                                          const struct ggml_tensor *src,
-                                          int64_t i3, int64_t i2,
-                                          int64_t i1_low, int64_t i1_high,
-                                          dpct::queue_ptr stream) try {
-
-    dpct::memcpy_direction kind;
-    char * src_ptr;
-    if (src->backend == GGML_BACKEND_CPU) {
-        kind = dpct::host_to_device;
-        src_ptr = (char *) src->data;
-    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
-        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
-        kind = dpct::device_to_device;
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
-        int id;
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            id = dpct::dev_mgr::instance().current_device_id()));
-        src_ptr = (char *) extra->data_device[id];
-    } else {
-        GGML_ASSERT(false);
-    }
-    char * dst_ptr = (char *) dst;
-
-    const int64_t ne0 = src->ne[0];
-    const int64_t nb0 = src->nb[0];
-    const int64_t nb1 = src->nb[1];
-    const int64_t nb2 = src->nb[2];
-    const int64_t nb3 = src->nb[3];
-    const enum ggml_type type = src->type;
-    const int64_t ts = ggml_type_size(type);
-    const int64_t bs = ggml_blck_size(type);
-    int64_t i1_diff = i1_high - i1_low;
-
-    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
-    if (nb0 == ts && nb1 == ts*ne0/bs) {
-        return DPCT_CHECK_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1));
-    } else if (nb0 == ts) {
-        return DPCT_CHECK_ERROR(
-            dpct::async_dpct_memcpy(dst_ptr, ts * ne0 / bs, x, nb1,
-                                    ts * ne0 / bs, i1_diff, kind, *stream));
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            dpct::err0 r = DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
-                rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream));
-            /*
-            DPCT1001:85: The statement could not be removed.
-            */
-            /*
-            DPCT1000:86: Error handling if-stmt was detected but could not be
-            rewritten.
-            */
-            if (r != 0) return r;
-        }
-        return 0;
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_op_get_rows(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_d, const float *src1_d,
-                                  float *dst_d, const dpct::queue_ptr &stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
-
-    const int32_t * src1_i32 = (const int32_t *) src1_d;
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            get_rows_cuda_float(src0, src1, dst, (const sycl::half *)src0_d,
-                                src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_F32:
-            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        default:
-            // TODO: k-quants
-            fprintf(stderr, "%s: unsupported type: %s\n", __func__, ggml_type_name(src0->type));
-            GGML_ASSERT(false);
-            break;
-    }
-}
-
-template <class op>
-inline void ggml_cuda_op_bin_bcast(const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd,
-                                   const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
-             (sycl::half *)dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
-             main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-}
-
-static void ggml_cuda_op_repeat(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_d, const float *src1_d,
-                                float *dst_d,
-                                const dpct::queue_ptr &main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
-
-    (void) src1;
-    (void) src1_d;
-}
-
-inline void ggml_cuda_op_add(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_cuda_op_acc(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
-
-    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
-
-    (void) dst;
-}
-
-inline void ggml_cuda_op_mul(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_cuda_op_div(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_cuda_op_gelu(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_silu(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_gelu_quick(const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_relu(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_leaky_relu(const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float negative_slope;
-    memcpy(&negative_slope, dst->op_params, sizeof(float));
-
-    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_sqr(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_norm(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_group_norm(const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int num_groups = dst->op_params[0];
-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_concat(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_dd, const float *src1_dd,
-                                float *dst_dd,
-                                const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
-    }
-
-    (void) src1;
-    (void) dst;
-}
-
-inline void ggml_cuda_op_upscale(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const float *src0_dd, const float *src1_dd,
-                                 float *dst_dd,
-                                 const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    const int scale_factor = dst->op_params[0];
-
-    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_pad(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    pad_f32_cuda(src0_dd, dst_dd,
-        src0->ne[0], src0->ne[1], src0->ne[2],
-        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_rms_norm(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_mul_mat_q(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) try {
-
-    const int64_t ne00 = src0->ne[0];
-
-    const int64_t ne10 = src1->ne[0];
-    GGML_ASSERT(ne10 % QK8_1 == 0);
-
-    const int64_t ne0 = dst->ne[0];
-
-    const int64_t row_diff = row_high - row_low;
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
-    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static int64_t get_row_rounding(ggml_type type) {
-    int64_t min_compute_capability = INT_MAX;
-    int64_t max_compute_capability = INT_MIN;
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > g_device_caps[id].cc) {
-                min_compute_capability = g_device_caps[id].cc;
-            }
-            if (max_compute_capability < g_device_caps[id].cc) {
-                max_compute_capability = g_device_caps[id].cc;
-            }
-        }
-    }
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
-        case GGML_TYPE_Q3_K:
-            return min_compute_capability < CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#else
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q6_K:
-            return 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-}
-
-inline void ggml_cuda_op_mul_mat_vec_q(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) {
-
-    GGML_ASSERT(ggml_nrows(src1) == 1);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
-}
-
-inline void ggml_cuda_op_dequantize_mul_mat_vec(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) {
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef GGML_CUDA_F16
-    cuda_pool_alloc<half> src1_dfloat_a;
-    half * src1_dfloat = nullptr; // dfloat == half
-
-    bool src1_convert_f16 =
-        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
-        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
-
-    if (src1_convert_f16) {
-        src1_dfloat = src1_dfloat_a.alloc(ne00);
-        ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
-                                ne00, 1, sizeof(float), 0, 0,
-                                ne00, 1, sizeof(half),  0, 0, stream);
-    }
-#else
-    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
-#endif // GGML_CUDA_F16
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
-}
-
-inline void ggml_cuda_op_mul_mat_cublas(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) try {
-
-    GGML_ASSERT(src0_dd_i  != nullptr);
-    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i   != nullptr);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-
-    const int64_t row_diff = row_high - row_low;
-
-    int id;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(id = dpct::dev_mgr::instance().current_device_id()));
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
-
-    const int compute_capability = g_device_caps[id].cc;
-
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
-        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        cuda_pool_alloc<sycl::half> src0_as_f16;
-        if (src0->type != GGML_TYPE_F16) {
-            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
-            GGML_ASSERT(to_fp16_cuda != nullptr);
-            size_t ne = row_diff*ne00;
-            src0_as_f16.alloc(ne);
-            to_fp16_cuda(src0_dd_i, src0_as_f16.get(), ne, stream);
-        }
-        const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16
-                                         ? (const sycl::half *)src0_dd_i
-                                         : src0_as_f16.get();
-
-        cuda_pool_alloc<sycl::half> src1_as_f16;
-        if (src1->type != GGML_TYPE_F16) {
-            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-            GGML_ASSERT(to_fp16_cuda != nullptr);
-            size_t ne = src1_ncols*ne10;
-            src1_as_f16.alloc(ne);
-            to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream);
-        }
-        const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
-                                         ? (const sycl::half *)src1_ddf_i
-                                         : src1_as_f16.get();
-        cuda_pool_alloc<sycl::half> dst_f16(row_diff * src1_ncols);
-
-        const sycl::half alpha_f16 = 1.0f;
-        const sycl::half beta_f16 = 0.0f;
-
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm(
-            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
-            &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
-            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16,
-            dst_f16.get(), dpct::library_data_t::real_half, ldc,
-            dpct::library_data_t::real_half)));
-
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
-    }
-    else {
-        cuda_pool_alloc<float> src0_ddq_as_f32;
-
-        if (src0->type != GGML_TYPE_F32) {
-            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-            GGML_ASSERT(to_fp32_cuda != nullptr);
-            src0_ddq_as_f32.alloc(row_diff*ne00);
-            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream);
-        }
-        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get();
-
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(
-            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
-            dpct::get_value(&alpha, *g_cublas_handles[id]), src0_ddf_i, ne00,
-            src1_ddf_i, ne10, dpct::get_value(&beta, *g_cublas_handles[id]),
-            dst_dd_i, ldc)));
-    }
-
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_padded_row_size;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-inline void ggml_cuda_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past      = ((int32_t *) dst->op_params)[0];
-    const int n_dims      = ((int32_t *) dst->op_params)[1];
-    const int mode        = ((int32_t *) dst->op_params)[2];
-    const int n_ctx       = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
-
-    // RoPE alteration for extended context
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-    const int32_t * pos = nullptr;
-    if ((mode & 1) == 0) {
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
-        GGML_ASSERT(src1->ne[0] == ne2);
-        pos = (const int32_t *) src1_dd;
-    }
-
-    const bool is_neox = mode & 2;
-    const bool is_glm  = mode & 4;
-
-    rope_corr_dims corr_dims;
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
-
-    // compute
-    if (is_glm) {
-        GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
-    } else if (is_neox) {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
-                           ne00, n_dims, nrows, pos, freq_scale, ne01,
-                           freq_base, ext_factor, attn_factor, corr_dims,
-                           main_stream);
-        } else {
-            GGML_ASSERT(false);
-        }
-    } else {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
-                      nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                      attn_factor, corr_dims, main_stream);
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    //GGML_ASSERT(ne01 + n_past == ne00);
-    GGML_ASSERT(n_head == ne02);
-
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-
-    (void) src1;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_im2col(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_dd, const float *src1_dd,
-                                float *dst_dd,
-                                const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
-
-    const int64_t IC = src1->ne[is_2D ? 2 : 1];
-    const int64_t IH = is_2D ? src1->ne[1] : 1;
-    const int64_t IW =         src1->ne[0];
-
-    const int64_t KH = is_2D ? src0->ne[1] : 1;
-    const int64_t KW =         src0->ne[0];
-
-    const int64_t OH = is_2D ? dst->ne[2] : 1;
-    const int64_t OW =         dst->ne[1];
-
-    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-
-    im2col_f32_f16_cuda(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH,
-                        IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
-
-    (void) src0;
-    (void) src0_dd;
-}
-
-inline void ggml_cuda_op_sum_rows(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_argsort(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const float *src0_dd, const float *src1_dd,
-                                 float *dst_dd,
-                                 const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_I32);
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_diag_mask_inf(const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst, const float *src0_dd,
-                                       const float *src1_dd, float *dst_dd,
-                                       const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int nrows0 = ggml_nrows(src0);
-
-    const int n_past = ((int32_t *) dst->op_params)[0];
-
-    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_soft_max(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
-
-    float scale = 1.0f;
-    memcpy(&scale, dst->op_params, sizeof(float));
-
-    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
-
-    (void) dst;
-}
-
-inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float scale;
-    memcpy(&scale, dst->op_params, sizeof(float));
-
-    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
-    /*
-    DPCT1010:87: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    CUDA_CHECK(0);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float min;
-    float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
-    /*
-    DPCT1010:88: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    CUDA_CHECK(0);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_flatten(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const ggml_cuda_op_flatten_t op) try {
-    const int64_t nrows0 = ggml_nrows(src0);
-
-    const bool use_src1 = src1 != nullptr;
-    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
-
-    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_GPU_SPLIT);
-
-    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool src0_on_device =             src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
-    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
-    const bool  dst_on_device =              dst->backend == GGML_BACKEND_GPU;
-
-    // dd = data device
-    float * src0_ddf = nullptr;
-    float * src1_ddf = nullptr;
-    float *  dst_ddf = nullptr;
-
-    cuda_pool_alloc<float> src0_f;
-    cuda_pool_alloc<float> src1_f;
-    cuda_pool_alloc<float>  dst_f;
-
-    ggml_cuda_set_device(g_main_device);
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
-
-    if (src0_on_device) {
-        src0_ddf = (float *) src0_extra->data_device[g_main_device];
-    } else {
-        src0_ddf = src0_f.alloc(ggml_nelements(src0));
-        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
-    }
-
-    if (use_src1) {
-        if (src1_on_device) {
-            src1_ddf = (float *) src1_extra->data_device[g_main_device];
-        } else {
-            src1_ddf = src1_f.alloc(ggml_nelements(src1));
-            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
-        }
-    }
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device];
-    } else {
-        dst_ddf = dst_f.alloc(ggml_nelements(dst));
-    }
-
-    // do the computation
-    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    /*
-    DPCT1010:89: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    CUDA_CHECK(0);
-
-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
-    }
-
-    if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            dpct::get_current_device().queues_wait_and_throw()));
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_set_peer_access(const int n_tokens) {
-    static bool peer_access_enabled = false;
-
-    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
-
-    if (peer_access_enabled == enable_peer_access) {
-        return;
-    }
-
-#ifdef NDEBUG
-    for (int id = 0; id < g_device_count; ++id) {
-        CUDA_CHECK(ggml_cuda_set_device(id));
-        CUDA_CHECK(cudaDeviceSynchronize());
-    }
-
-    for (int id = 0; id < g_device_count; ++id) {
-        CUDA_CHECK(ggml_cuda_set_device(id));
-
-        for (int id_other = 0; id_other < g_device_count; ++id_other) {
-            if (id == id_other) {
-                continue;
-            }
-            if (id != g_main_device && id_other != g_main_device) {
-                continue;
-            }
-
-            int can_access_peer;
-            CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
-            if (can_access_peer) {
-                if (enable_peer_access) {
-                    CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
-                } else {
-                    CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
-                }
-            }
-        }
-    }
-#endif // NDEBUG
-
-    peer_access_enabled = enable_peer_access;
-}
-
-static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 ggml_cuda_op_mul_mat_t op,
-                                 const bool convert_src1_to_q8_1) try {
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-    const int64_t nrows1 = ggml_nrows(src1);
-
-    GGML_ASSERT(ne03 == ne13);
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
-
-    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
-
-    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
-
-    const int64_t i02_divisor = ne12 / ne02;
-
-    const size_t src0_ts = ggml_type_size(src0->type);
-    const size_t src0_bs = ggml_blck_size(src0->type);
-    const size_t q8_1_ts = sizeof(block_q8_1);
-    const size_t q8_1_bs = QK8_1;
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
-    const bool src0_is_contiguous = ggml_is_contiguous(src0);
-    const bool src1_is_contiguous = ggml_is_contiguous(src1);
-
-    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
-
-    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
-    GGML_ASSERT(!(split && ne02 > 1));
-    GGML_ASSERT(!(split && ne03 > 1));
-    GGML_ASSERT(!(split && ne02 < ne12));
-
-    // dd = data device
-    char  *  src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
-    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
-    char  * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
-    float *   dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-    // as = actual size
-    size_t  src0_as[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t   dst_as[GGML_CUDA_MAX_DEVICES] = {0};
-
-    int64_t  row_low[GGML_CUDA_MAX_DEVICES];
-    int64_t row_high[GGML_CUDA_MAX_DEVICES];
-
-    int used_devices = 0;
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        // by default, use all rows
-        row_low[id]  = 0;
-        row_high[id] = ne01;
-
-        // for multi GPU, get the row boundaries from tensor split
-        // and round to mul_mat_q tile sizes
-        if (split) {
-            const int64_t rounding = get_row_rounding(src0->type);
-
-            if (id != 0) {
-                row_low[id]  = ne01*g_tensor_split[id];
-                if (row_low[id] < ne01) {
-                    row_low[id] -= row_low[id] % rounding;
-                }
-            }
-
-            if (id != g_device_count - 1) {
-                row_high[id]  = ne01*g_tensor_split[id + 1];
-                if (row_high[id] < ne01) {
-                    row_high[id] -= row_high[id] % rounding;
-                }
-            }
-        }
-    }
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
-            continue;
-        }
-
-        used_devices++;
-
-        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
-        const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
-
-        ggml_cuda_set_device(id);
-        const dpct::queue_ptr stream = g_cudaStreams[id][0];
-
-        if (src0_on_device && src0_is_contiguous) {
-            src0_dd[id] = (char *) src0_extra->data_device[id];
-        } else {
-            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
-            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
-        }
-
-        if (src1_on_device && src1_is_contiguous) {
-            src1_ddf[id] = (float *) src1_extra->data_device[id];
-        } else {
-            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
-        }
-
-        if (convert_src1_to_q8_1) {
-            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
-
-            if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
-                /*
-                DPCT1010:90: SYCL uses exceptions to report errors and does not
-                use the error codes. The call was replaced with 0. You need to
-                rewrite this code.
-                */
-                CUDA_CHECK(0);
-            }
-        }
-
-        if (dst_on_device) {
-            dst_dd[id] = (float *) dst_extra->data_device[id];
-        } else {
-            const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
-            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
-        }
-    }
-
-    // if multiple devices are used they need to wait for the main device
-    // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && used_devices > 1) {
-        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-        /*
-        DPCT1024:91: The original code returned the error code that was further
-        consumed by the program logic. This original code was replaced with 0.
-        You may need to rewrite the program logic consuming the error code.
-        */
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            *src0_extra->events[g_main_device][0] =
-                g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier()));
-    }
-
-    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
-    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
-        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
-        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
-
-        for (int64_t id = 0; id < g_device_count; ++id) {
-            if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
-                continue;
-            }
-
-            const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
-            const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
-            const int64_t row_diff = row_high[id] - row_low[id];
-
-            ggml_cuda_set_device(id);
-            const dpct::queue_ptr stream = g_cudaStreams[id][is];
-
-            // wait for main GPU data if necessary
-            if (split && (id != g_main_device || is != 0)) {
-                CUDA_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier(
-                    {*src0_extra->events[g_main_device][0]})));
-            }
-
-            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
-                const int64_t i03 = i0 / ne12;
-                const int64_t i02 = i0 % ne12;
-
-                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
-
-                // for split tensors the data begins at i0 == i0_offset_low
-                char  *  src0_dd_i =  src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
-                float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
-                char  * src1_ddq_i = src1_ddq[id] +  src1_ddq_i_offset;
-                float *   dst_dd_i =   dst_dd[id] + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
-
-                // the main device memory buffer can be on VRAM scratch, with space for all partial results
-                // in that case an offset on dst_ddf_i is needed
-                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
-                    dst_dd_i += row_low[id]; // offset is 0 if no tensor split
-                }
-
-                // copy src0, src1 to device if necessary
-                if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
-                    if (id != g_main_device) {
-                        if (convert_src1_to_q8_1) {
-                            char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
-                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
-                                src1_ddq_i, src1_ddq_i_source,
-                                src1_ncols * src1_padded_col_size * q8_1_ts /
-                                    q8_1_bs)));
-                        } else {
-                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
-                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
-                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
-                                src1_ddf_i, src1_ddf_i_source,
-                                src1_ncols * ne10 * sizeof(float))));
-                        }
-                    }
-                } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
-                                   src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
-                } else {
-                    GGML_ASSERT(false);
-                }
-
-                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
-                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
-                    /*
-                    DPCT1010:92: SYCL uses exceptions to report errors and does
-                    not use the error codes. The call was replaced with 0. You
-                    need to rewrite this code.
-                    */
-                    CUDA_CHECK(0);
-                }
-
-                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
-                }
-
-                // do the computation
-                op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
-                   row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
-                /*
-                DPCT1010:93: SYCL uses exceptions to report errors and does not
-                use the error codes. The call was replaced with 0. You need to
-                rewrite this code.
-                */
-                CUDA_CHECK(0);
-
-                // copy dst to host or other device if necessary
-                if (!dst_on_device) {
-                    void * dst_off_device;
-                    dpct::memcpy_direction kind;
-                    if (dst->backend == GGML_BACKEND_CPU) {
-                        dst_off_device = dst->data;
-                        kind = dpct::device_to_host;
-                    } else if (dst->backend == GGML_BACKEND_GPU) {
-                        dst_off_device = dst_extra->data_device[g_main_device];
-                        kind = dpct::device_to_device;
-                    } else {
-                        GGML_ASSERT(false);
-                    }
-                    if (split) {
-                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
-                        // dst is NOT transposed.
-                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
-                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
-                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
-                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                        dhf_dst_i += src1_col_0*ne0 + row_low[id];
-                        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
-                            dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
-                            row_diff * sizeof(float), row_diff * sizeof(float),
-                            src1_ncols, kind, *stream)));
-                    } else {
-                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
-                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                        dhf_dst_i += src1_col_0*ne0;
-                        CUDA_CHECK(DPCT_CHECK_ERROR(
-                            stream->memcpy(dhf_dst_i, dst_dd_i,
-                                           src1_ncols * ne0 * sizeof(float))));
-                    }
-                }
-
-                // add event for the main device to wait on until other device is done
-                if (split && (id != g_main_device || is != 0)) {
-                    /*
-                    DPCT1024:94: The original code returned the error code that
-                    was further consumed by the program logic. This original
-                    code was replaced with 0. You may need to rewrite the
-                    program logic consuming the error code.
-                    */
-                    CUDA_CHECK(DPCT_CHECK_ERROR(
-                        *src0_extra->events[id][is] =
-                            stream->ext_oneapi_submit_barrier()));
-                }
-            }
-        }
-    }
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
-            continue;
-        }
-        CUDA_CHECK(ggml_cuda_set_device(id));
-
-        // free buffers again when done
-        if (dst_as[id] > 0) {
-            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
-        }
-        if (src1_asq[id] > 0) {
-            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
-        }
-        if (src1_asf[id] > 0) {
-            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
-        }
-        if (src0_as[id] > 0) {
-            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
-        }
-    }
-
-    // main device waits for all other devices to be finished
-    if (split && g_device_count > 1) {
-        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
-        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
-
-        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-        for (int64_t id = 0; id < g_device_count; ++id) {
-            if (row_low[id] == row_high[id]) {
-                continue;
-            }
-            for (int64_t is = 0; is < is_max; ++is) {
-                CUDA_CHECK(DPCT_CHECK_ERROR(
-                    g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier(
-                        {*src0_extra->events[id][is]})));
-            }
-        }
-    }
-
-    if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            dpct::get_current_device().queues_wait_and_throw()));
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
-}
-
-static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
-}
-
-static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
-}
-
-static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
-}
-
-static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
-}
-
-static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
-}
-
-static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
-}
-
-static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
-}
-
-static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
-}
-
-static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
-}
-
-static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
-}
-
-static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
-}
-
-static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
-}
-
-static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
-}
-
-static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
-}
-
-static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
-}
-
-static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
-}
-
-static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
-}
-
-static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
-}
-
-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-            src1->type == GGML_TYPE_F32 &&
-             dst->type == GGML_TYPE_F32 &&
-            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
-}
-
-static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst) try {
-    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
-    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0,
-                                     const ggml_tensor *src1,
-                                     ggml_tensor *dst) try {
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    const int64_t row_stride_x = nb01 / sizeof(sycl::half);
-    const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
-
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
-                                   const sycl::half *src1_as_f16, char *dst,
-                                   const void **ptrs_src, void **ptrs_dst,
-                                   int64_t ne12, int64_t ne13, int64_t ne23,
-                                   size_t nb02, size_t nb03, size_t nb12,
-                                   size_t nb13, size_t nbd2, size_t nbd3,
-                                   int64_t r2, int64_t r3,
-                                   const sycl::nd_item<3> &item_ct1) {
-    int64_t i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
-                  item_ct1.get_local_id(2);
-    int64_t i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
-                  item_ct1.get_local_id(1);
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int64_t i03 = i13 / r3;
-    int64_t i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)         dst + i12*nbd2   + i13*nbd3;
-}
-
-static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
-                                                 const ggml_tensor *src1,
-                                                 ggml_tensor *dst) try {
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
-
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne  = ggml_nelements(dst);
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
-
-    CUBLAS_CHECK(
-        DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream));
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-    sycl::half *src0_as_f16 = (sycl::half *)src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
-
-    cuda_pool_alloc<sycl::half> src1_as_f16(ne1);
-    to_fp16_cuda(src1_ddf, src1_as_f16.get(), ne1, main_stream);
-
-    cuda_pool_alloc<sycl::half> dst_f16;
-    char * dst_t;
-
-    dpct::library_data_t cu_compute_type = CUBLAS_COMPUTE_16F;
-    dpct::library_data_t cu_data_type = dpct::library_data_t::real_half;
-
-    // dst strides
-    size_t nbd2 = dst->nb[2];
-    size_t nbd3 = dst->nb[3];
-
-    const sycl::half alpha_f16 = 1.0f;
-    const sycl::half beta_f16 = 0.0f;
-
-    const float alpha_f32 = 1.0f;
-    const float beta_f32  = 0.0f;
-
-    const void * alpha = &alpha_f16;
-    const void * beta  = &beta_f16;
-
-    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        dst_t = (char *) dst_f16.alloc(ne);
-
-        nbd2 /= sizeof(float) / sizeof(sycl::half);
-        nbd3 /= sizeof(float) / sizeof(sycl::half);
-    } else {
-        dst_t = (char *) dst_ddf;
-
-        cu_compute_type = CUBLAS_COMPUTE_32F;
-        cu_data_type = dpct::library_data_t::real_float;
-
-        alpha = &alpha_f32;
-        beta  = &beta_f32;
-    }
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-#if 0
-    // use cublasGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                CUBLAS_CHECK(
-                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            alpha, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F,   nb01/sizeof(half),
-                                   (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F,   nb11/sizeof(float),
-                            beta,  (      char *)       dst_t + i12*nbd2          + i13*nbd3,          cu_data_type, ne01,
-                            cu_compute_type,
-                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
-        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
-        // use cublasGemmStridedBatchedEx
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
-            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
-            (const char *)src0_as_f16, dpct::library_data_t::real_half,
-            nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
-            (const char *)src1_as_f16.get(), dpct::library_data_t::real_half,
-            nb11 / sizeof(float), src1->nb[2] / sizeof(float), beta,
-            (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float),
-            ne12 * ne13, cu_compute_type)));
-    } else {
-        // use cublasGemmBatchedEx
-        const int ne23 = ne12*ne13;
-
-        cuda_pool_alloc<const void *> ptrs_src(2*ne23);
-        cuda_pool_alloc<      void *> ptrs_dst(1*ne23);
-
-        sycl::range<3> block_dims(1, ne12, ne13);
-        /*
-        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(main_stream->get_device(),
-                                         {sycl::aspect::fp16});
-
-            main_stream->submit([&](sycl::handler &cgh) {
-                const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get();
-                const void **ptrs_src_get_ct3 = ptrs_src.get();
-                void **ptrs_dst_get_ct4 = ptrs_dst.get();
-
-                cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
-                                 [=](sycl::nd_item<3> item_ct1) {
-                                     k_compute_batched_ptrs(
-                                         src0_as_f16, src1_as_f16_get_ct1,
-                                         dst_t, ptrs_src_get_ct3,
-                                         ptrs_dst_get_ct4, ne12, ne13, ne23,
-                                         nb02, nb03, nb12, nb13, nbd2, nbd3, r2,
-                                         r3, item_ct1);
-                                 });
-            });
-        }
-        /*
-        DPCT1010:95: SYCL uses exceptions to report errors and does not use the
-        error codes. The call was replaced with 0. You need to rewrite this
-        code.
-        */
-        CUDA_CHECK(0);
-
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
-            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha,
-            (const void **)(ptrs_src.get() + 0 * ne23),
-            dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),
-            (const void **)(ptrs_src.get() + 1 * ne23),
-            dpct::library_data_t::real_half, nb11 / sizeof(float), beta,
-            (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23,
-            cu_compute_type)));
-    }
-#endif
-
-    if (dst->op_params[0] == GGML_PREC_DEFAULT) {
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16.get(), dst_ddf, ne, main_stream);
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
-        (src1->backend == GGML_BACKEND_GPU) &&
-        ( dst->backend == GGML_BACKEND_GPU);
-
-    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
-
-    int64_t min_compute_capability = INT_MAX;
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            min_compute_capability = g_device_caps[id].cc;
-        }
-    }
-
-#ifdef CUDA_USE_TENSOR_CORES
-    const bool use_tensor_cores = true;
-#else
-    const bool use_tensor_cores = false;
-#endif
-
-    // debug helpers
-    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
-    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
-    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
-    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-
-    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
-        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV single-batch
-        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
-        // KQ + KQV multi-batch
-        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
-    } else if (src0->type == GGML_TYPE_F32) {
-        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-#ifdef GGML_CUDA_FORCE_DMMV
-            const bool use_mul_mat_vec_q = false;
-#else
-            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
-#endif // GGML_CUDA_FORCE_DMMV
-
-            if (use_mul_mat_vec_q) {
-                // NOTE: this kernel does not support ggml_nrows(src1) > 1
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
-            } else {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
-            }
-        } else {
-            bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-
-            // when tensor cores are available, use them for large batch size
-            // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-            if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
-                use_mul_mat_q = false;
-            }
-
-            if (use_mul_mat_q) {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
-            } else {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-            }
-        }
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-#if 0
-template<typename ... Srcs>
-static __global__ void k_compute_batched_ptrs_id(
-        const void ** ptrs_src, void ** ptrs_dst,
-        int ne12, int ne13,
-        int ne23,
-        int nb02, int nb03,
-        int nb12, int nb13,
-        int nb2, int nb3,
-        int r2, int r3,
-        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
-        const half * src1_f16, half * dst_f16,
-        const int32_t * ids, const int id,
-        Srcs... src0s) {
-
-    int i = ids[id];
-
-    half * src0_f16;
-    const void * srcs_ar[] = { (const half *) src0s... };
-    if (src0_type == GGML_TYPE_F16) {
-        src0_f16 = (half *) srcs_ar[i];
-    } else {
-        src0_f16 = src0_as_f16;
-        if (threadIdx.x == 0 && threadIdx.y == 0) {
-            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
-            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
-        }
-    }
-
-    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int i03 = i13 / r3;
-    int i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
-}
-
-static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src00 = dst->src[2];
-
-    const int id = dst->op_params[0];
-
-    GGML_ASSERT(!ggml_is_transposed(src00));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src00->ne[1];
-    const int64_t ne02 = src00->ne[2];
-    const int64_t ne03 = src00->ne[3];
-
-    //const int64_t nb01 = src00->nb[1];
-    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    //const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
-
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne  = ggml_nelements(dst);
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
-
-    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    //void * src0_ddq = src0_extra->data_device[g_main_device];
-    //half * src0_as_f16 = (half *) src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
-
-    size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
-    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
-
-    size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    const half alpha_f16 = 1.0f;
-    const half beta_f16  = 0.0f;
-
-    // use cublasGemmBatchedEx
-    const int ne23 = ne12*ne13;
-
-    const void ** ptrs_src = nullptr;
-          void ** ptrs_dst = nullptr;
-
-    size_t ptrs_src_s = 0;
-    size_t ptrs_dst_s = 0;
-
-    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
-
-    int64_t src0_ne = ggml_nelements(src00);
-    half * src0_as_f16 = nullptr;
-    size_t src0_as = 0;
-    if (src00->type != GGML_TYPE_F16) {
-        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
-    }
-
-    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
-    dim3 block_dims(ne13, ne12);
-    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
-            ptrs_src, ptrs_dst,
-            ne12, ne13,
-            ne23,
-            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
-            nb12, nb13,
-            dst->nb[2], dst->nb[3],
-            r2, r3,
-            src00->type, src0_as_f16, src0_ne,
-            src1_as_f16, dst_f16,
-            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
-            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    CUBLAS_CHECK(
-    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-            ne01, ne11, ne10,
-            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
-                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
-            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
-            ne23,
-            CUBLAS_COMPUTE_16F,
-            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-    if (src0_as != 0) {
-        ggml_cuda_pool_free(src0_as_f16, src0_as);
-    }
-    if (ptrs_src_s != 0) {
-        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
-    }
-    if (ptrs_dst_s != 0) {
-        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
-    }
-
-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
-}
-#endif
-
-static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
-                                 const ggml_tensor *src1,
-                                 ggml_tensor *dst) try {
-#if 0
-    ggml_cuda_mul_mat_id_cublas(dst);
-    // TODO: mmq/mmv support
-#endif
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb1  =  dst->nb[1];
-
-    const struct ggml_tensor * ids = src0;
-    const int32_t id = ((int32_t *) dst->op_params)[0];
-    const int32_t n_as = ((int32_t *) dst->op_params)[1];
-
-    std::vector<char> ids_host(ggml_nbytes(ids));
-
-    const dpct::queue_ptr stream = g_cudaStreams[g_main_device][0];
-
-    if (ids->backend == GGML_BACKEND_GPU) {
-        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids))));
-        CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait()));
-    } else {
-        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
-    }
-
-    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
-    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
-
-    ggml_tensor_extra_gpu src1_row_extra;
-    ggml_tensor_extra_gpu dst_row_extra;
-
-    ggml_tensor src1_row = *src1;
-    ggml_tensor dst_row = *dst;
-
-    src1_row.backend = GGML_BACKEND_GPU;
-    dst_row.backend  = GGML_BACKEND_GPU;
-
-    src1_row.extra = &src1_row_extra;
-    dst_row.extra = &dst_row_extra;
-
-    char * src1_original = src1->backend == GGML_BACKEND_CPU ?
-        (char *) src1->data : (char *) src1_extra->data_device[g_main_device];
-    char * dst_original  =  dst->backend == GGML_BACKEND_CPU ?
-        (char *)  dst->data : (char *)  dst_extra->data_device[g_main_device];
-
-    if (src1->ne[1] == 1) {
-        GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
-        GGML_ASSERT(dst->backend  == GGML_BACKEND_GPU);
-
-        for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-            //int32_t row_id;
-            //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-            //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
-
-            const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-            GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
-            src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1];
-            src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set?
-
-            dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1];
-            dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set?
-
-            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
-        }
-    } else {
-        cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1));
-        cuda_pool_alloc<char>  dst_contiguous(sizeof(float)*ggml_nelements(dst));
-
-        src1_row_extra.data_device[g_main_device] = src1_contiguous.get();
-        dst_row_extra.data_device[g_main_device]  =  dst_contiguous.get();
-
-        const dpct::memcpy_direction src1_kind =
-            src1->backend == GGML_BACKEND_CPU ? dpct::host_to_device
-                                              : dpct::device_to_device;
-        const dpct::memcpy_direction dst_kind = dst->backend == GGML_BACKEND_CPU
-                                                    ? dpct::device_to_host
-                                                    : dpct::device_to_device;
-
-        for (int32_t row_id = 0; row_id < n_as; ++row_id) {
-            const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
-            int64_t num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                if (row_id_i != row_id) {
-                    continue;
-                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                CUDA_CHECK(DPCT_CHECK_ERROR(
-                    stream->memcpy(src1_contiguous.get() + num_src1_rows * nb11,
-                                   src1_original + i01 * nb11, nb11)));
-                num_src1_rows++;
-            }
-
-            if (num_src1_rows == 0) {
-                continue;
-            }
-
-            src1_row.ne[1] = num_src1_rows;
-            dst_row.ne[1] = num_src1_rows;
-
-            src1_row.nb[1] = nb11;
-            src1_row.nb[2] = num_src1_rows*nb11;
-            src1_row.nb[3] = num_src1_rows*nb11;
-
-            dst_row.nb[1] = nb1;
-            dst_row.nb[2] = num_src1_rows*nb1;
-            dst_row.nb[3] = num_src1_rows*nb1;
-
-            ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
-
-            num_src1_rows = 0;
-            for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-                const int32_t row_id_i = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-                if (row_id_i != row_id) {
-                    continue;
-                }
-
-                GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-                CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
-                    dst_original + i01 * nb1,
-                    dst_contiguous.get() + num_src1_rows * nb1, nb1)));
-                num_src1_rows++;
-            }
-        }
-    }
-
-    if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(DPCT_CHECK_ERROR(stream->wait()));
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
-}
-
-static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
-}
-
-static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
-                          ggml_tensor *dst) try {
-    const int64_t ne = ggml_nelements(src0);
-    GGML_ASSERT(ne == ggml_nelements(src1));
-
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
-
-    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
-    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    GGML_ASSERT(src0->ne[3] == 1);
-
-    const int64_t nb00 = src0->nb[0];
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    GGML_ASSERT(src1->ne[3] == 1);
-
-    const int64_t nb10 = src1->nb[0];
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2];
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
-
-    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-
-    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
-
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
-                ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-
-    (void) dst;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    // TODO: why do we pass dst as src1 here?
-    ggml_cuda_cpy(src0, dst, nullptr);
-    (void) src1;
-}
-
-static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
-}
-
-static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
-}
-
-static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
-}
-
-static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
-}
-
-static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
-}
-
-static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
-}
-
-static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
-}
-
-static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    (void) src0;
-    (void) src1;
-    (void) dst;
-}
-
-static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
-}
-
-void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
-    const int64_t nrows = ggml_nrows(tensor);
-
-    const int64_t ne0 = tensor->ne[0];
-
-    const size_t nb1 = tensor->nb[1];
-
-    ggml_backend_type backend = tensor->backend;
-    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (backend == GGML_BACKEND_GPU && id != g_main_device) {
-            continue;
-        }
-
-        ggml_cuda_set_device(id);
-
-        int64_t row_low, row_high;
-        if (backend == GGML_BACKEND_GPU) {
-            row_low = 0;
-            row_high = nrows;
-        } else if (backend == GGML_BACKEND_GPU_SPLIT) {
-            const int64_t rounding = get_row_rounding(tensor->type);
-
-            row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-            row_low -= row_low % rounding;
-
-            if (id == g_device_count - 1) {
-                row_high = nrows;
-            } else {
-                row_high = nrows*g_tensor_split[id + 1];
-                row_high -= row_high % rounding;
-            }
-        } else {
-            GGML_ASSERT(false);
-        }
-        if (row_low == row_high) {
-            continue;
-        }
-
-        int64_t nrows_split = row_high - row_low;
-
-        const size_t offset_split = row_low*nb1;
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        char * buf;
-        CUDA_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(
-                                        size, dpct::get_in_order_queue())));
-        char * buf_host = (char *)data + offset_split;
-
-        // set padding to 0 to avoid possible NaN values
-        if (size > original_size) {
-            CUDA_CHECK(DPCT_CHECK_ERROR(
-                dpct::get_in_order_queue()
-                    .memset(buf + original_size, 0, size - original_size)
-                    .wait()));
-        }
-
-        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
-                                        .memcpy(buf, buf_host, original_size)
-                                        .wait()));
-
-        extra->data_device[id] = buf;
-
-        if (backend == GGML_BACKEND_GPU_SPLIT) {
-            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-                CUDA_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] =
-                                                new sycl::event()));
-            }
-        }
-    }
-
-    tensor->extra = extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_free_data(struct ggml_tensor *tensor) try {
-    if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (extra->data_device[id] != nullptr) {
-            CUDA_CHECK(ggml_cuda_set_device(id));
-            CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(
-                extra->data_device[id], dpct::get_in_order_queue())));
-        }
-
-        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-            if (extra->events[id][is] != nullptr) {
-                CUDA_CHECK(ggml_cuda_set_device(id));
-                CUDA_CHECK(DPCT_CHECK_ERROR(
-                    dpct::destroy_event(extra->events[id][is])));
-            }
-        }
-    }
-
-    delete extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
-static size_t g_temp_tensor_extra_index = 0;
-
-static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
-    if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
-    }
-
-    size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
-    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
-    memset(extra, 0, sizeof(*extra));
-
-    return extra;
-}
-
-static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
-                                          bool scratch, bool force_inplace,
-                                          bool no_alloc) try {
-    if (scratch && g_scratch_size == 0) {
-        return;
-    }
-
-    tensor->backend = GGML_BACKEND_GPU;
-
-    // recursively assign CUDA buffers until a compute tensor is found
-    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
-        const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
-        }
-    }
-    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
-    }
-
-    if (scratch && no_alloc) {
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra;
-
-    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW ||
-        force_inplace;
-    const size_t size = ggml_nbytes(tensor);
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
-        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-        size_t offset = 0;
-        if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->op_params, sizeof(size_t));
-        }
-        extra = ggml_cuda_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device] = src0_ddc + offset;
-    } else if (tensor->op == GGML_OP_CPY) {
-        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
-        void * src1_ddv = src1_extra->data_device[g_main_device];
-        extra = ggml_cuda_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device] = src1_ddv;
-    } else if (scratch) {
-        GGML_ASSERT(size <= g_scratch_size);
-        if (g_scratch_offset + size > g_scratch_size) {
-            g_scratch_offset = 0;
-        }
-
-        char * data = (char *) g_scratch_buffer;
-        if (data == nullptr) {
-            CUDA_CHECK(DPCT_CHECK_ERROR(
-                data = (char *)sycl::malloc_device(
-                    g_scratch_size, dpct::get_in_order_queue())));
-            g_scratch_buffer = data;
-        }
-        extra = ggml_cuda_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device] = data + g_scratch_offset;
-
-        g_scratch_offset += size;
-
-        GGML_ASSERT(g_scratch_offset <= g_scratch_size);
-    } else { // allocate new buffers outside of scratch
-        void * data;
-        CUDA_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(
-                                        size, dpct::get_in_order_queue())));
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            dpct::get_in_order_queue().memset(data, 0, size).wait()));
-        extra = new ggml_tensor_extra_gpu;
-        memset(extra, 0, sizeof(*extra));
-        extra->data_device[g_main_device] = data;
-    }
-
-    tensor->extra = extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor,
-                                     size_t offset) try {
-    if (g_scratch_size == 0) {
-        return;
-    }
-    if (g_scratch_buffer == nullptr) {
-        ggml_cuda_set_device(g_main_device);
-        CUDA_CHECK(
-            DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
-                                 g_scratch_size, dpct::get_in_order_queue())));
-    }
-
-    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
-
-    const bool inplace = tensor->view_src != nullptr;
-
-    if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
-        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-        size_t view_offset = 0;
-        if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
-        }
-        extra->data_device[g_main_device] = src0_ddc + view_offset;
-    } else {
-        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
-    }
-
-    tensor->extra = extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_copy_to_device(struct ggml_tensor *tensor) try {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-    GGML_ASSERT(ggml_is_contiguous(tensor));
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
-                                    .memcpy(extra->data_device[g_main_device],
-                                            tensor->data, ggml_nbytes(tensor))
-                                    .wait()));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
-}
-
-void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
-}
-
-void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
-}
-
-void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
-}
-
-void ggml_cuda_set_main_device(const int main_device) try {
-    if (main_device >= g_device_count) {
-        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
-                main_device, g_device_count, g_main_device);
-        return;
-    }
-
-    if (g_main_device != main_device && g_device_count > 1) {
-        g_main_device = main_device;
-        dpct::device_info prop;
-        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(g_main_device))));
-        fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__,
-                g_main_device, prop.get_name());
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_set_scratch_size(const size_t scratch_size) {
-    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
-    // it still won't always work as expected, but it's better than nothing
-    if (scratch_size > g_scratch_size) {
-        ggml_cuda_free_scratch();
-    }
-    g_scratch_size = std::max(g_scratch_size, scratch_size);
-}
-
-void ggml_cuda_free_scratch() try {
-    if (g_scratch_buffer == nullptr) {
-        return;
-    }
-
-    CUDA_CHECK(DPCT_CHECK_ERROR(
-        sycl::free(g_scratch_buffer, dpct::get_in_order_queue())));
-    g_scratch_buffer = nullptr;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
-
-    ggml_cuda_func_t func;
-    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
-
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
-        return false;
-    }
-
-    if (tensor->op == GGML_OP_MUL_MAT) {
-        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
-#ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
-#endif
-            return false;
-        }
-    }
-
-    switch (tensor->op) {
-        case GGML_OP_REPEAT:
-            func = ggml_cuda_repeat;
-            break;
-        case GGML_OP_GET_ROWS:
-            func = ggml_cuda_get_rows;
-            break;
-        case GGML_OP_DUP:
-            func = ggml_cuda_dup;
-            break;
-        case GGML_OP_ADD:
-            func = ggml_cuda_add;
-            break;
-        case GGML_OP_ACC:
-            func = ggml_cuda_acc;
-            break;
-        case GGML_OP_MUL:
-            func = ggml_cuda_mul;
-            break;
-        case GGML_OP_DIV:
-            func = ggml_cuda_div;
-            break;
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(tensor)) {
-                case GGML_UNARY_OP_GELU:
-                    func = ggml_cuda_gelu;
-                    break;
-                case GGML_UNARY_OP_SILU:
-                    func = ggml_cuda_silu;
-                    break;
-                case GGML_UNARY_OP_GELU_QUICK:
-                    func = ggml_cuda_gelu_quick;
-                    break;
-                case GGML_UNARY_OP_TANH:
-                    func = ggml_cuda_tanh;
-                    break;
-                case GGML_UNARY_OP_RELU:
-                    func = ggml_cuda_relu;
-                    break;
-                default:
-                    return false;
-            }
-            break;
-        case GGML_OP_NORM:
-            func = ggml_cuda_norm;
-            break;
-        case GGML_OP_GROUP_NORM:
-            func = ggml_cuda_group_norm;
-            break;
-        case GGML_OP_CONCAT:
-            func = ggml_cuda_concat;
-            break;
-        case GGML_OP_UPSCALE:
-            func = ggml_cuda_upscale;
-            break;
-        case GGML_OP_PAD:
-            func = ggml_cuda_pad;
-            break;
-        case GGML_OP_LEAKY_RELU:
-            func = ggml_cuda_leaky_relu;
-            break;
-        case GGML_OP_RMS_NORM:
-            func = ggml_cuda_rms_norm;
-            break;
-        case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-                return false;
-            }
-            func = ggml_cuda_mul_mat;
-            break;
-        case GGML_OP_MUL_MAT_ID:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
-                return false;
-            }
-            func = ggml_cuda_mul_mat_id;
-            break;
-        case GGML_OP_SCALE:
-            func = ggml_cuda_scale;
-            break;
-        case GGML_OP_SQR:
-            func = ggml_cuda_sqr;
-            break;
-        case GGML_OP_CLAMP:
-            func = ggml_cuda_clamp;
-            break;
-        case GGML_OP_CPY:
-            func = ggml_cuda_cpy;
-            break;
-        case GGML_OP_CONT:
-            func = ggml_cuda_dup;
-            break;
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-            func = ggml_cuda_nop;
-            break;
-        case GGML_OP_DIAG_MASK_INF:
-            func = ggml_cuda_diag_mask_inf;
-            break;
-        case GGML_OP_SOFT_MAX:
-            func = ggml_cuda_soft_max;
-            break;
-        case GGML_OP_ROPE:
-            func = ggml_cuda_rope;
-            break;
-        case GGML_OP_ALIBI:
-            func = ggml_cuda_alibi;
-            break;
-        case GGML_OP_IM2COL:
-            func = ggml_cuda_im2col;
-            break;
-        case GGML_OP_SUM_ROWS:
-            func = ggml_cuda_sum_rows;
-            break;
-        case GGML_OP_ARGSORT:
-            func = ggml_cuda_argsort;
-            break;
-        default:
-            return false;
-    }
-
-    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT) {
-        ggml_cuda_set_peer_access(tensor->src[1]->ne[1]);
-    }
-
-    if (params->ith != 0) {
-        return true;
-    }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return true;
-    }
-    func(tensor->src[0], tensor->src[1], tensor);
-    return true;
-}
-
-int ggml_cuda_get_device_count() try {
-    int device_count;
-    if (DPCT_CHECK_ERROR(device_count =
-                             dpct::dev_mgr::instance().device_count()) != 0) {
-        return 0;
-    }
-    return device_count;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_get_device_description(int device, char *description,
-                                      size_t description_size) try {
-    dpct::device_info prop;
-    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
-        prop, dpct::dev_mgr::instance().get_device(device))));
-    snprintf(description, description_size, "%s", prop.get_name());
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-// backend interface
-
-#define UNUSED GGML_UNUSED
-
-// cuda buffer
-
-struct ggml_backend_buffer_context_cuda {
-    int device;
-    void * dev_ptr = nullptr;
-    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
-    size_t temp_tensor_extra_index = 0;
-
-    ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
-
-    ~ggml_backend_buffer_context_cuda() {
-        delete[] temp_tensor_extras;
-    }
-
-    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
-        if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
-        }
-
-        size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
-        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
-        memset(extra, 0, sizeof(*extra));
-
-        return extra;
-    }
-};
-
-static void
-ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
-    delete ctx;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    return ctx->dev_ptr;
-}
-
-static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer,
-                                                 ggml_tensor *tensor) try {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->buft == buffer->buft);
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
-
-    extra->data_device[ctx->device] = tensor->data;
-
-    tensor->backend = GGML_BACKEND_GPU;
-    tensor->extra = extra;
-
-    if (ggml_is_quantized(tensor->type)) {
-        // initialize padding to 0 to avoid possible NaN values
-        int64_t row_low = 0;
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
-        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
-
-        if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset(
-                (char *)tensor->data + original_size, 0,
-                padded_size - original_size)));
-        }
-    }
-
-    UNUSED(buffer);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer,
-                                                ggml_tensor *tensor,
-                                                const void *data, size_t offset,
-                                                size_t size) try {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-
-    ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
-
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(dpct::get_in_order_queue()
-                             .memcpy((char *)tensor->data + offset, data, size)
-                             .wait()));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer,
-                                                const ggml_tensor *tensor,
-                                                void *data, size_t offset,
-                                                size_t size) try {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-
-    ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
-
-    CUDA_CHECK(DPCT_CHECK_ERROR(
-        dpct::get_in_order_queue()
-            .memcpy(data, (const char *)tensor->data + offset, size)
-            .wait()));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer,
-                                           uint8_t value) try {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-
-    ggml_cuda_set_device(ctx->device);
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(dpct::get_current_device().queues_wait_and_throw()));
-
-    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
-                                    .memset(ctx->dev_ptr, value, buffer->size)
-                                    .wait()));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
-    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
-    /* .cpy_tensor_from = */ NULL,
-    /* .cpy_tensor_to   = */ NULL,
-    /* .clear           = */ ggml_backend_cuda_buffer_clear,
-};
-
-// cuda buffer type
-
-static ggml_backend_buffer_t
-ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
-                                           size_t size) try {
-    int device = (int) (intptr_t) buft->context;
-
-    ggml_cuda_set_device(device);
-
-    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
-
-    void * dev_ptr;
-    CUDA_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(
-                                    size, dpct::get_in_order_queue())));
-
-    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
-
-    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
-
-    UNUSED(buft);
-}
-
-static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
-    int64_t ne0 = tensor->ne[0];
-
-    if (ggml_is_quantized(tensor->type)) {
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-    }
-
-    return size;
-
-    UNUSED(buft);
-}
-
-static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cuda(backend);
-
-    UNUSED(buft);
-}
-
-static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
-    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
-    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
-    /* .is_host          = */ nullptr,
-};
-
-ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
-
-    static bool ggml_backend_cuda_buffer_type_initialized = false;
-
-    if (!ggml_backend_cuda_buffer_type_initialized) {
-        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
-            ggml_backend_cuda_buffer_types[i] = {
-                /* .iface    = */ ggml_backend_cuda_buffer_type_interface,
-                /* .context  = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
-            };
-        }
-        ggml_backend_cuda_buffer_type_initialized = true;
-    }
-
-    return &ggml_backend_cuda_buffer_types[device];
-}
-
-// host buffer type
-
-static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
-    ggml_cuda_host_free(buffer->context);
-}
-
-static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
-    void * ptr = ggml_cuda_host_malloc(size);
-
-    if (ptr == nullptr) {
-        // fallback to cpu buffer
-        return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
-    }
-
-    // FIXME: this is a hack to avoid having to implement a new buffer type
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
-
-    return buffer;
-}
-
-ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
-        /* .iface    = */ {
-            /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
-            /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-            /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-            /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-            /* .is_host          = */ ggml_backend_cpu_buffer_type()->iface.is_host,
-        },
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_cuda_buffer_type_host;
-}
-
-// backend
-
-struct ggml_backend_context_cuda {
-    int device;
-};
-
-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-    return GGML_CUDA_NAME;
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    delete cuda_ctx;
-    delete backend;
-}
-
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
-}
-
-static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
-                                               ggml_tensor *tensor,
-                                               const void *data, size_t offset,
-                                               size_t size) try {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
-        (char *)tensor->data + offset, data, size)));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
-                                               const ggml_tensor *tensor,
-                                               void *data, size_t offset,
-                                               size_t size) try {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
-        data, (const char *)tensor->data + offset, size)));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
-
-    UNUSED(backend);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    GGML_ASSERT(!"not implemented");
-
-    return nullptr;
-
-    UNUSED(backend);
-    UNUSED(cgraph);
-}
-
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_ASSERT(!"not implemented");
-
-    UNUSED(backend);
-    UNUSED(plan);
-}
-
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_ASSERT(!"not implemented");
-
-    UNUSED(backend);
-    UNUSED(plan);
-}
-
-static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    ggml_cuda_set_main_device(cuda_ctx->device);
-
-    ggml_compute_params params = {};
-    params.type = GGML_TASK_COMPUTE;
-    params.ith = 0;
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
-            continue;
-
-        assert(node->backend == GGML_BACKEND_GPU);
-        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-        assert(node->extra != nullptr);
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend == GGML_BACKEND_GPU);
-                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-                assert(node->src[j]->extra != nullptr);
-            }
-        }
-
-        bool ok = ggml_cuda_compute_forward(&params, node);
-        if (!ok) {
-            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
-        }
-        GGML_ASSERT(ok);
-
-#if 0
-        if (node->type == GGML_TYPE_F32) {
-            cudaDeviceSynchronize();
-            std::vector<float> tmp(ggml_nelements(node), 0.0f);
-            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
-            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
-                ggml_type_name(node->src[0]->type),
-                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
-                node->src[0]->name,
-                node->src[1] ? node->src[1]->name : "none");
-            double sum = 0.0;
-            double sq_sum = 0.0;
-            for (int i = 0; i < ggml_nelements(node); i++) {
-                printf("%f ", tmp[i]);
-                sum += tmp[i];
-                sq_sum += tmp[i]*tmp[i];
-            }
-            printf("\n");
-            printf("sum: %f, ", sum);
-            printf("sq_sum: %f\n", sq_sum);
-        }
-#endif
-    }
-
-    UNUSED(backend);
-}
-
-static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_GELU:
-                case GGML_UNARY_OP_SILU:
-                case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_GELU_QUICK:
-                case GGML_UNARY_OP_TANH:
-                    return true;
-                default:
-                    return false;
-            }
-            break;
-        case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
-            {
-                struct ggml_tensor * a;
-                struct ggml_tensor * b;
-                if (op->op == GGML_OP_MUL_MAT) {
-                    a = op->src[0];
-                    b = op->src[1];
-                } else {
-                    a = op->src[2];
-                    b = op->src[1];
-                }
-                if (a->ne[3] != b->ne[3]) {
-                    return false;
-                }
-                return true;
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                switch (op->src[0]->type) {
-                    case GGML_TYPE_F16:
-                    case GGML_TYPE_F32:
-                    case GGML_TYPE_Q4_0:
-                    case GGML_TYPE_Q4_1:
-                    case GGML_TYPE_Q5_0:
-                    case GGML_TYPE_Q5_1:
-                    case GGML_TYPE_Q8_0:
-                        return true;
-                    default:
-                        return false;
-                }
-            } break;
-        case GGML_OP_CPY:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                ggml_type src1_type = op->src[1]->type;
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                return false;
-            } break;
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_NORM:
-        case GGML_OP_REPEAT:
-        case GGML_OP_DUP:
-        case GGML_OP_ADD:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
-        case GGML_OP_SQR:
-        case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
-        case GGML_OP_IM2COL:
-        case GGML_OP_SUM_ROWS:
-        case GGML_OP_ARGSORT:
-        case GGML_OP_ACC:
-        case GGML_OP_CONCAT:
-        case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
-        case GGML_OP_LEAKY_RELU:
-            return true;
-        default:
-            return false;
-    }
-
-    UNUSED(backend);
-}
-
-static ggml_backend_i cuda_backend_i = {
-    /* .get_name                = */ ggml_backend_cuda_name,
-    /* .free                    = */ ggml_backend_cuda_free,
-    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
-    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
-    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
-    /* .cpy_tensor_from_async   = */ NULL,
-    /* .cpy_tensor_to_async     = */ NULL,
-    /* .synchronize             = */ ggml_backend_cuda_synchronize,
-    /* .graph_plan_create       = */ ggml_backend_cuda_graph_plan_create,
-    /* .graph_plan_free         = */ ggml_backend_cuda_graph_plan_free,
-    /* .graph_plan_compute      = */ ggml_backend_cuda_graph_plan_compute,
-    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
-    /* .supports_op             = */ ggml_backend_cuda_supports_op,
-};
-
-ggml_backend_t ggml_backend_cuda_init(int device) {
-    ggml_init_cublas(); // TODO: remove from ggml.c
-
-    if (device < 0 || device >= ggml_cuda_get_device_count()) {
-        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
-        return nullptr;
-    }
-
-    // not strictly necessary, but it may reduce the overhead of the first graph_compute
-    ggml_cuda_set_main_device(device);
-
-    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
-        /* .device = */ device
-    };
-
-    ggml_backend_t cuda_backend = new ggml_backend {
-        /* .interface = */ cuda_backend_i,
-        /* .context   = */ ctx
-    };
-
-    return cuda_backend;
-}
-
-bool ggml_backend_is_cuda(ggml_backend_t backend) {
-    return backend->iface.get_name == ggml_backend_cuda_name;
-}
-
-static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
-    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
-    return cuda_backend;
-
-    UNUSED(params);
-}
-
-extern "C" int ggml_backend_cuda_reg_devices();
-
-int ggml_backend_cuda_reg_devices() {
-    int device_count = ggml_cuda_get_device_count();
-    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
-    for (int i = 0; i < device_count; i++) {
-        char name[128];
-        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
-        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
-    }
-    return device_count;
-}
diff --git a/dpcpp_out2/ggml-cuda.h b/dpcpp_out2/ggml-cuda.h
deleted file mode 100644
index cdb0c0c41618a..0000000000000
--- a/dpcpp_out2/ggml-cuda.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#pragma once
-
-#include "ggml.h"
-#include "ggml-backend.h"
-
-#ifdef GGML_USE_HIPBLAS
-#define GGML_CUDA_NAME "ROCm"
-#define GGML_CUBLAS_NAME "hipBLAS"
-#else
-#define GGML_CUDA_NAME "CUDA"
-#define GGML_CUBLAS_NAME "cuBLAS"
-#endif
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#define GGML_CUDA_MAX_DEVICES       16
-
-// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`.
-GGML_API void   ggml_init_cublas(void);
-
-// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API bool   ggml_cublas_loaded(void);
-
-GGML_API void * ggml_cuda_host_malloc(size_t size);
-GGML_API void   ggml_cuda_host_free(void * ptr);
-
-GGML_API bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API void   ggml_cuda_set_tensor_split(const float * tensor_split);
-GGML_API void   ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
-GGML_API void   ggml_cuda_free_data(struct ggml_tensor * tensor);
-
-GGML_API void   ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
-GGML_API void   ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
-GGML_API void   ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
-
-GGML_API void   ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor);
-GGML_API void   ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
-GGML_API void   ggml_cuda_copy_to_device(struct ggml_tensor * tensor);
-
-GGML_API void   ggml_cuda_set_main_device(int main_device);
-GGML_API void   ggml_cuda_set_mul_mat_q(bool mul_mat_q);
-GGML_API void   ggml_cuda_set_scratch_size(size_t scratch_size);
-GGML_API void   ggml_cuda_free_scratch(void);
-GGML_API bool   ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-
-GGML_API int    ggml_cuda_get_device_count(void);
-GGML_API void   ggml_cuda_get_device_description(int device, char * description, size_t description_size);
-
-// backend API
-GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
-
-GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
-GGML_API int  ggml_backend_cuda_get_device(ggml_backend_t backend);
-
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
-
-// pinned host buffer for use with CPU backend for faster copies between CPU and GPU
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/dpcpp_out2/ggml.h b/dpcpp_out2/ggml.h
deleted file mode 100644
index 5bb5323434e01..0000000000000
--- a/dpcpp_out2/ggml.h
+++ /dev/null
@@ -1,2253 +0,0 @@
-#pragma once
-
-//
-// GGML Tensor Library
-//
-// This documentation is still a work in progress.
-// If you wish some specific topics to be covered, feel free to drop a comment:
-//
-//   https://github.com/ggerganov/whisper.cpp/issues/40
-//
-// ## Overview
-//
-// This library implements:
-//
-//  - a set of tensor operations
-//  - automatic differentiation
-//  - basic optimization algorithms
-//
-// The aim of this library is to provide a minimalistic approach for various machine learning tasks. This includes,
-// but is not limited to, the following:
-//
-//  - linear regression
-//  - support vector machines
-//  - neural networks
-//
-// The library allows the user to define a certain function using the available tensor operations. This function
-// definition is represented internally via a computation graph. Each tensor operation in the function definition
-// corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
-// function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
-// using one of the available optimization algorithms.
-//
-// For example, here we define the function: f(x) = a*x^2 + b
-//
-//   {
-//       struct ggml_init_params params = {
-//           .mem_size   = 16*1024*1024,
-//           .mem_buffer = NULL,
-//       };
-//
-//       // memory allocation happens here
-//       struct ggml_context * ctx = ggml_init(params);
-//
-//       struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-//
-//       ggml_set_param(ctx, x); // x is an input variable
-//
-//       struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-//       struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
-//       struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
-//       struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);
-//
-//       ...
-//   }
-//
-// Notice that the function definition above does not involve any actual computation. The computation is performed only
-// when the user explicitly requests it. For example, to compute the function's value at x = 2.0:
-//
-//   {
-//       ...
-//
-//       struct ggml_cgraph * gf = ggml_new_graph(ctx);
-//       ggml_build_forward_expand(gf, f);
-//
-//       // set the input variable and parameter values
-//       ggml_set_f32(x, 2.0f);
-//       ggml_set_f32(a, 3.0f);
-//       ggml_set_f32(b, 4.0f);
-//
-//       ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
-//
-//       printf("f = %f\n", ggml_get_f32_1d(f, 0));
-//
-//       ...
-//   }
-//
-// The actual computation is performed in the ggml_graph_compute() function.
-//
-// The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
-// ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
-// in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
-// and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
-// actually needed.
-//
-// The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
-// differentiation and optimization algorithms.
-//
-// The described approach allows to define the function graph once and then compute its forward or backward graphs
-// multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
-// the user can avoid the memory allocation overhead at runtime.
-//
-// The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
-// citizens, but in theory the library can be extended to support FP8 and integer data types.
-//
-// Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
-// and binary operations. Most of the available operations fall into one of these two categories. With time, it became
-// clear that the library needs to support more complex operations. The way to support these operations is not clear
-// yet, but a few examples are demonstrated in the following operations:
-//
-//   - ggml_permute()
-//   - ggml_conv_1d_1s()
-//   - ggml_conv_1d_2s()
-//
-// For each tensor operator, the library implements a forward and backward computation function. The forward function
-// computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
-// input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
-// calculus class, or watch the following video:
-//
-//   What is Automatic Differentiation?
-//   https://www.youtube.com/watch?v=wG_nF1awSSY
-//
-//
-// ## Tensor data (struct ggml_tensor)
-//
-// The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
-// the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
-// pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:
-//
-//   {
-//       struct ggml_tensor * c = ggml_add(ctx, a, b);
-//
-//       assert(c->src[0] == a);
-//       assert(c->src[1] == b);
-//   }
-//
-// The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
-// number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
-// to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
-// permutation. All tensor operations have to take the stride into account and not assume that the tensor is
-// contiguous in memory.
-//
-// The data of the tensor is accessed via the "data" pointer. For example:
-//
-//   {
-//       const int nx = 2;
-//       const int ny = 3;
-//
-//       struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
-//
-//       for (int y = 0; y < ny; y++) {
-//           for (int x = 0; x < nx; x++) {
-//               *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
-//           }
-//       }
-//
-//       ...
-//   }
-//
-// Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
-//
-// ## The matrix multiplication operator (ggml_mul_mat)
-//
-// TODO
-//
-//
-// ## Multi-threading
-//
-// TODO
-//
-//
-// ## Overview of ggml.c
-//
-// TODO
-//
-//
-// ## SIMD optimizations
-//
-// TODO
-//
-//
-// ## Debugging ggml
-//
-// TODO
-//
-//
-
-#ifdef GGML_SHARED
-#    if defined(_WIN32) && !defined(__MINGW32__)
-#        ifdef GGML_BUILD
-#            define GGML_API __declspec(dllexport)
-#        else
-#            define GGML_API __declspec(dllimport)
-#        endif
-#    else
-#        define GGML_API __attribute__ ((visibility ("default")))
-#    endif
-#else
-#    define GGML_API
-#endif
-
-// TODO: support for clang
-#ifdef __GNUC__
-#    define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
-#elif defined(_MSC_VER)
-#    define GGML_DEPRECATED(func, hint) __declspec(deprecated(hint)) func
-#else
-#    define GGML_DEPRECATED(func, hint) func
-#endif
-
-#ifndef __GNUC__
-#    define GGML_ATTRIBUTE_FORMAT(...)
-#elif defined(__MINGW32__)
-#    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
-#else
-#    define GGML_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
-#endif
-
-#define DPCT_PROFILING_ENABLED
-#include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
-#include <stdint.h>
-#include <stddef.h>
-#include <stdbool.h>
-
-#define GGML_FILE_MAGIC   0x67676d6c // "ggml"
-#define GGML_FILE_VERSION 1
-
-#define GGML_QNT_VERSION        2    // bump this on quantization format changes
-#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
-
-#define GGML_MAX_DIMS           4
-#define GGML_MAX_PARAMS         2048
-#define GGML_MAX_CONTEXTS       64
-#define GGML_MAX_SRC            10
-#define GGML_MAX_NAME           64
-#define GGML_MAX_OP_PARAMS      64
-#define GGML_DEFAULT_N_THREADS  4
-#define GGML_DEFAULT_GRAPH_SIZE 2048
-#if UINTPTR_MAX == 0xFFFFFFFF
-    #define GGML_MEM_ALIGN 4
-#else
-    #define GGML_MEM_ALIGN 16
-#endif
-
-#define GGML_EXIT_SUCCESS 0
-#define GGML_EXIT_ABORTED 1
-
-#define GGUF_MAGIC "GGUF"
-
-#define GGUF_VERSION 3
-
-#define GGUF_DEFAULT_ALIGNMENT 32
-
-#define GGML_UNUSED(x) (void)(x)
-
-#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
-
-#define GGML_ASSERT(x) \
-    do { \
-        if (!(x)) { \
-            fflush(stdout); \
-            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            ggml_print_backtrace(); \
-            abort(); \
-        } \
-    } while (0)
-
-#ifndef NDEBUG
-#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
-#elif defined(__GNUC__)
-#define GGML_UNREACHABLE() __builtin_unreachable()
-#elif defined(_MSC_VER)
-#define GGML_UNREACHABLE() __assume(0)
-#else
-#define GGML_UNREACHABLE() ((void) 0)
-#endif
-
-// used to copy the number of elements and stride in bytes of tensors into local variables.
-// main purpose is to reduce code duplication and improve readability.
-//
-// example:
-//
-//    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
-//    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb);
-//
-#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
-    const type prefix##0 = (pointer)->array[0]; \
-    GGML_UNUSED(prefix##0);
-#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_1    (type, prefix, pointer, array) \
-    const type prefix##1 = (pointer)->array[1]; \
-    GGML_UNUSED(prefix##1);
-#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_2    (type, prefix, pointer, array) \
-    const type prefix##2 = (pointer)->array[2]; \
-    GGML_UNUSED(prefix##2);
-#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \
-    GGML_TENSOR_LOCALS_3  (type, prefix, pointer, array) \
-    const type prefix##3 = (pointer)->array[3]; \
-    GGML_UNUSED(prefix##3);
-
-#define GGML_TENSOR_UNARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
-#define GGML_TENSOR_BINARY_OP_LOCALS \
-    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
-    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
-    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)
-
-#ifdef  __cplusplus
-extern "C" {
-#endif
-
-#if defined(__ARM_NEON) && defined(SYCL_LANGUAGE_VERSION)
-    typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON) && !defined(_MSC_VER)
-    typedef __fp16 ggml_fp16_t;
-#else
-    typedef uint16_t ggml_fp16_t;
-#endif
-
-    // convert FP16 <-> FP32
-    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int n);
-
-    struct ggml_object;
-    struct ggml_context;
-
-    enum ggml_type {
-        GGML_TYPE_F32  = 0,
-        GGML_TYPE_F16  = 1,
-        GGML_TYPE_Q4_0 = 2,
-        GGML_TYPE_Q4_1 = 3,
-        // GGML_TYPE_Q4_2 = 4, support has been removed
-        // GGML_TYPE_Q4_3 (5) support has been removed
-        GGML_TYPE_Q5_0 = 6,
-        GGML_TYPE_Q5_1 = 7,
-        GGML_TYPE_Q8_0 = 8,
-        GGML_TYPE_Q8_1 = 9,
-        // k-quantizations
-        GGML_TYPE_Q2_K = 10,
-        GGML_TYPE_Q3_K = 11,
-        GGML_TYPE_Q4_K = 12,
-        GGML_TYPE_Q5_K = 13,
-        GGML_TYPE_Q6_K = 14,
-        GGML_TYPE_Q8_K = 15,
-        GGML_TYPE_I8,
-        GGML_TYPE_I16,
-        GGML_TYPE_I32,
-        GGML_TYPE_COUNT,
-    };
-
-    // precision
-    enum ggml_prec {
-        GGML_PREC_DEFAULT,
-        GGML_PREC_F32,
-    };
-
-    enum ggml_backend_type {
-        GGML_BACKEND_CPU = 0,
-        GGML_BACKEND_GPU = 10,
-        GGML_BACKEND_GPU_SPLIT = 20,
-    };
-
-    // model file types
-    enum ggml_ftype {
-        GGML_FTYPE_UNKNOWN     = -1,
-        GGML_FTYPE_ALL_F32     = 0,
-        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
-        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q3_K = 11, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q4_K = 12, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors
-        GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors
-    };
-
-    // available tensor operations:
-    enum ggml_op {
-        GGML_OP_NONE = 0,
-
-        GGML_OP_DUP,
-        GGML_OP_ADD,
-        GGML_OP_ADD1,
-        GGML_OP_ACC,
-        GGML_OP_SUB,
-        GGML_OP_MUL,
-        GGML_OP_DIV,
-        GGML_OP_SQR,
-        GGML_OP_SQRT,
-        GGML_OP_LOG,
-        GGML_OP_SUM,
-        GGML_OP_SUM_ROWS,
-        GGML_OP_MEAN,
-        GGML_OP_ARGMAX,
-        GGML_OP_REPEAT,
-        GGML_OP_REPEAT_BACK,
-        GGML_OP_CONCAT,
-        GGML_OP_SILU_BACK,
-        GGML_OP_NORM, // normalize
-        GGML_OP_RMS_NORM,
-        GGML_OP_RMS_NORM_BACK,
-        GGML_OP_GROUP_NORM,
-
-        GGML_OP_MUL_MAT,
-        GGML_OP_MUL_MAT_ID,
-        GGML_OP_OUT_PROD,
-
-        GGML_OP_SCALE,
-        GGML_OP_SET,
-        GGML_OP_CPY,
-        GGML_OP_CONT,
-        GGML_OP_RESHAPE,
-        GGML_OP_VIEW,
-        GGML_OP_PERMUTE,
-        GGML_OP_TRANSPOSE,
-        GGML_OP_GET_ROWS,
-        GGML_OP_GET_ROWS_BACK,
-        GGML_OP_DIAG,
-        GGML_OP_DIAG_MASK_INF,
-        GGML_OP_DIAG_MASK_ZERO,
-        GGML_OP_SOFT_MAX,
-        GGML_OP_SOFT_MAX_BACK,
-        GGML_OP_ROPE,
-        GGML_OP_ROPE_BACK,
-        GGML_OP_ALIBI,
-        GGML_OP_CLAMP,
-        GGML_OP_CONV_TRANSPOSE_1D,
-        GGML_OP_IM2COL,
-        GGML_OP_CONV_TRANSPOSE_2D,
-        GGML_OP_POOL_1D,
-        GGML_OP_POOL_2D,
-        GGML_OP_UPSCALE, // nearest interpolate
-        GGML_OP_PAD,
-        GGML_OP_ARGSORT,
-        GGML_OP_LEAKY_RELU,
-
-        GGML_OP_FLASH_ATTN,
-        GGML_OP_FLASH_FF,
-        GGML_OP_FLASH_ATTN_BACK,
-        GGML_OP_WIN_PART,
-        GGML_OP_WIN_UNPART,
-        GGML_OP_GET_REL_POS,
-        GGML_OP_ADD_REL_POS,
-
-        GGML_OP_UNARY,
-
-        GGML_OP_MAP_UNARY,
-        GGML_OP_MAP_BINARY,
-
-        GGML_OP_MAP_CUSTOM1_F32,
-        GGML_OP_MAP_CUSTOM2_F32,
-        GGML_OP_MAP_CUSTOM3_F32,
-
-        GGML_OP_MAP_CUSTOM1,
-        GGML_OP_MAP_CUSTOM2,
-        GGML_OP_MAP_CUSTOM3,
-
-        GGML_OP_CROSS_ENTROPY_LOSS,
-        GGML_OP_CROSS_ENTROPY_LOSS_BACK,
-
-        GGML_OP_COUNT,
-    };
-
-    enum ggml_unary_op {
-        GGML_UNARY_OP_ABS,
-        GGML_UNARY_OP_SGN,
-        GGML_UNARY_OP_NEG,
-        GGML_UNARY_OP_STEP,
-        GGML_UNARY_OP_TANH,
-        GGML_UNARY_OP_ELU,
-        GGML_UNARY_OP_RELU,
-        GGML_UNARY_OP_GELU,
-        GGML_UNARY_OP_GELU_QUICK,
-        GGML_UNARY_OP_SILU,
-
-        GGML_UNARY_OP_COUNT,
-    };
-
-    enum ggml_object_type {
-        GGML_OBJECT_TENSOR,
-        GGML_OBJECT_GRAPH,
-        GGML_OBJECT_WORK_BUFFER
-    };
-
-    enum ggml_log_level {
-        GGML_LOG_LEVEL_ERROR = 2,
-        GGML_LOG_LEVEL_WARN = 3,
-        GGML_LOG_LEVEL_INFO = 4,
-        GGML_LOG_LEVEL_DEBUG = 5
-    };
-
-    // ggml object
-    struct ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct ggml_object * next;
-
-        enum ggml_object_type type;
-
-        char padding[4];
-    };
-
-    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
-    // n-dimensional tensor
-    struct ggml_tensor {
-        enum ggml_type         type;
-        enum ggml_backend_type backend;
-
-        struct ggml_backend_buffer * buffer;
-
-        int64_t ne[GGML_MAX_DIMS]; // number of elements
-        size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
-                                   // nb[0] = ggml_type_size(type)
-                                   // nb[1] = nb[0]   * (ne[0] / ggml_blck_size(type)) + padding
-                                   // nb[i] = nb[i-1] * ne[i-1]
-
-        // compute data
-        enum ggml_op op;
-
-        // op params - allocated as int32_t for alignment
-        int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
-
-        bool is_param;
-
-        struct ggml_tensor * grad;
-        struct ggml_tensor * src[GGML_MAX_SRC];
-
-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
-
-        struct ggml_tensor * view_src;
-        size_t               view_offs;
-
-        void * data;
-
-        char name[GGML_MAX_NAME];
-
-        void * extra; // extra things e.g. for ggml-cuda.cu
-
-        char padding[8];
-    };
-
-    static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
-
-    // the compute plan that needs to be prepared for ggml_graph_compute()
-    // since https://github.com/ggerganov/ggml/issues/287
-    struct ggml_cplan {
-        size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
-        uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
-
-        int n_threads;
-
-        // abort ggml_graph_compute when true
-        bool (*abort_callback)(void * data);
-        void * abort_callback_data;
-    };
-
-    enum ggml_cgraph_eval_order {
-        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-        GGML_CGRAPH_EVAL_ORDER_COUNT
-    };
-
-    struct ggml_hash_set {
-        size_t size;
-        struct ggml_tensor ** keys;
-    };
-
-    // computation graph
-    struct ggml_cgraph {
-        int size;
-        int n_nodes;
-        int n_leafs;
-
-        struct ggml_tensor ** nodes;
-        struct ggml_tensor ** grads;
-        struct ggml_tensor ** leafs;
-
-        struct ggml_hash_set visited_hash_table;
-
-        enum ggml_cgraph_eval_order order;
-
-        // performance
-        int     perf_runs;
-        int64_t perf_cycles;
-        int64_t perf_time_us;
-    };
-
-    // scratch buffer
-    struct ggml_scratch {
-        size_t offs;
-        size_t size;
-        void * data;
-    };
-
-    struct ggml_init_params {
-        // memory pool
-        size_t mem_size;   // bytes
-        void * mem_buffer; // if NULL, memory will be allocated internally
-        bool   no_alloc;   // don't allocate memory for the tensor data
-    };
-
-
-    // compute types
-
-    // NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
-    // This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
-    enum ggml_task_type {
-        GGML_TASK_INIT = 0,
-        GGML_TASK_COMPUTE,
-        GGML_TASK_FINALIZE,
-    };
-
-    struct ggml_compute_params {
-        enum ggml_task_type type;
-
-        // ith = thread index, nth = number of threads
-        int ith, nth;
-
-        // work buffer for all threads
-        size_t wsize;
-        void * wdata;
-    };
-
-    // misc
-
-    GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
-    GGML_API int64_t ggml_time_ms(void);
-    GGML_API int64_t ggml_time_us(void);
-    GGML_API int64_t ggml_cycles(void);
-    GGML_API int64_t ggml_cycles_per_ms(void);
-
-    GGML_API void    ggml_print_backtrace(void);
-
-    GGML_API void    ggml_numa_init(void); // call once for better performance on NUMA systems
-    GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node
-
-    GGML_API void    ggml_print_object (const struct ggml_object * obj);
-    GGML_API void    ggml_print_objects(const struct ggml_context * ctx);
-
-    GGML_API int64_t ggml_nelements   (const struct ggml_tensor * tensor);
-    GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
-    GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-
-    GGML_API int    ggml_blck_size(enum ggml_type type);
-    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
-    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
-
-    GGML_DEPRECATED(
-    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
-    "use ggml_row_size() instead");
-
-    GGML_API const char * ggml_type_name(enum ggml_type type);
-    GGML_API const char * ggml_op_name  (enum ggml_op   op);
-    GGML_API const char * ggml_op_symbol(enum ggml_op   op);
-
-    GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
-    GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
-
-    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);
-
-    GGML_API bool    ggml_is_quantized(enum ggml_type type);
-
-    // TODO: temporary until model loading of ggml examples is refactored
-    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
-
-    GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
-    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
-    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars
-
-    GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
-
-    // use this to compute the memory overhead of a tensor
-    GGML_API size_t ggml_tensor_overhead(void);
-
-    // main
-
-    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
-    GGML_API void                  ggml_free(struct ggml_context * ctx);
-
-    GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
-
-    GGML_API size_t  ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
-    GGML_API bool    ggml_get_no_alloc(struct ggml_context * ctx);
-    GGML_API void    ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
-
-    GGML_API void *  ggml_get_mem_buffer     (const struct ggml_context * ctx);
-    GGML_API size_t  ggml_get_mem_size       (const struct ggml_context * ctx);
-    GGML_API size_t  ggml_get_max_tensor_size(const struct ggml_context * ctx);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int    n_dims,
-            const int64_t *ne);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_1d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_2d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0,
-            int64_t ne1);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_3d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0,
-            int64_t ne1,
-            int64_t ne2);
-
-    GGML_API struct ggml_tensor * ggml_new_tensor_4d(
-            struct ggml_context * ctx,
-            enum   ggml_type type,
-            int64_t ne0,
-            int64_t ne1,
-            int64_t ne2,
-            int64_t ne3);
-
-    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
-
-    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
-    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
-
-    // Context tensor enumeration and lookup
-    GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
-    GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
-
-    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
-
-    // Converts a flat index into coordinates
-    GGML_API void    ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
-
-    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
-
-    GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
-
-    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
-
-    GGML_API float   ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
-    GGML_API void    ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
-
-    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
-    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
-
-    GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
-
-    GGML_API const char *         ggml_get_name   (const struct ggml_tensor * tensor);
-    GGML_API struct ggml_tensor * ggml_set_name   (      struct ggml_tensor * tensor, const char * name);
-    GGML_ATTRIBUTE_FORMAT(2, 3)
-    GGML_API struct ggml_tensor * ggml_format_name(      struct ggml_tensor * tensor, const char * fmt, ...);
-
-    //
-    // operations on tensors with backpropagation
-    //
-
-    GGML_API struct ggml_tensor * ggml_dup(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_dup_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_add(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_add_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_add_cast(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            enum   ggml_type      type);
-
-    GGML_API struct ggml_tensor * ggml_add1(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_add1_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // dst = a
-    // view(dst, nb1, nb2, nb3, offset) += b
-    // return dst
-    GGML_API struct ggml_tensor * ggml_acc(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                nb2,
-            size_t                nb3,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_acc_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                nb2,
-            size_t                nb3,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_sub(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_sub_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_mul(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_mul_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_div(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_div_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_sqr(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sqr_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sqrt(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sqrt_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_log(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_log_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // return scalar
-    GGML_API struct ggml_tensor * ggml_sum(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
-    GGML_API struct ggml_tensor * ggml_sum_rows(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // mean along rows
-    GGML_API struct ggml_tensor * ggml_mean(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // argmax along rows
-    GGML_API struct ggml_tensor * ggml_argmax(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // if a is the same shape as b, and a is not parameter, return a
-    // otherwise, return a new tensor: repeat(a) to fit in b
-    GGML_API struct ggml_tensor * ggml_repeat(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // sums repetitions in a into shape of b
-    GGML_API struct ggml_tensor * ggml_repeat_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // concat a and b on dim 2
-    // used in stable-diffusion
-    GGML_API struct ggml_tensor * ggml_concat(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_abs(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_abs_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sgn(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_sgn_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_neg(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_neg_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_step(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_step_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_tanh(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_tanh_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_elu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_elu_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_relu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_leaky_relu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a, float negative_slope, bool inplace);
-
-    GGML_API struct ggml_tensor * ggml_relu_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_gelu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_gelu_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_gelu_quick(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_gelu_quick_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_silu(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    GGML_API struct ggml_tensor * ggml_silu_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // a - x
-    // b - dy
-    GGML_API struct ggml_tensor * ggml_silu_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // normalize along rows
-    GGML_API struct ggml_tensor * ggml_norm(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 eps);
-
-    GGML_API struct ggml_tensor * ggml_norm_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 eps);
-
-    GGML_API struct ggml_tensor * ggml_rms_norm(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 eps);
-
-    GGML_API struct ggml_tensor * ggml_rms_norm_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 eps);
-
-    // group normalize along ne0*ne1*n_groups
-    // used in stable-diffusion
-    // TODO: eps is hardcoded to 1e-6 for now
-    GGML_API struct ggml_tensor * ggml_group_norm(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_groups);
-
-    GGML_API struct ggml_tensor * ggml_group_norm_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_groups);
-
-    // a - x
-    // b - dy
-    GGML_API struct ggml_tensor * ggml_rms_norm_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            float                 eps);
-
-    // A: k columns, n rows => [ne03, ne02, n, k]
-    // B: k columns, m rows  (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
-    // result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
-    GGML_API struct ggml_tensor * ggml_mul_mat(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // change the precision of a matrix multiplication
-    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
-    GGML_API void ggml_mul_mat_set_prec(
-            struct ggml_tensor * a,
-            enum ggml_prec       prec);
-
-    // indirect matrix multiplication
-    //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
-    GGML_API struct ggml_tensor * ggml_mul_mat_id(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * const as[],
-            int                   n_as,
-            struct ggml_tensor  * ids,
-            int                   id,
-            struct ggml_tensor  * b);
-
-    // A: m columns, n rows,
-    // B: p columns, n rows,
-    // result is m columns, p rows
-    GGML_API struct ggml_tensor * ggml_out_prod(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    //
-    // operations on tensors without backpropagation
-    //
-
-    GGML_API struct ggml_tensor * ggml_scale(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 s);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_scale_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 s);
-
-    // b -> view(a,offset,nb1,nb2,3), return modified a
-    GGML_API struct ggml_tensor * ggml_set(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                nb2,
-            size_t                nb3,
-            size_t                offset);
-
-    // b -> view(a,offset,nb1,nb2,3), return view(a)
-    GGML_API struct ggml_tensor * ggml_set_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                nb2,
-            size_t                nb3,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_set_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                offset);
-
-    // b -> view(a,offset,nb1,nb2,3), return modified a
-    GGML_API struct ggml_tensor * ggml_set_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                offset);
-
-    // b -> view(a,offset,nb1,nb2,3), return view(a)
-    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            size_t                nb1,
-            size_t                offset);
-
-    // a -> b, return view(b)
-    GGML_API struct ggml_tensor * ggml_cpy(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // a -> b, in-place, return view(b)
-    GGML_API struct ggml_tensor * ggml_cpy_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // make contiguous
-    GGML_API struct ggml_tensor * ggml_cont(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // make contiguous, in-place
-    GGML_API struct ggml_tensor * ggml_cont_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // make contiguous, with new shape
-    GGML_API struct ggml_tensor * ggml_cont_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0);
-
-    GGML_API struct ggml_tensor * ggml_cont_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1);
-
-    GGML_API struct ggml_tensor * ggml_cont_3d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2);
-
-    GGML_API struct ggml_tensor * ggml_cont_4d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2,
-            int64_t               ne3);
-
-    // return view(a), b specifies the new shape
-    // TODO: when we start computing gradient, make a copy instead of view
-    GGML_API struct ggml_tensor * ggml_reshape(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // return view(a)
-    // TODO: when we start computing gradient, make a copy instead of view
-    GGML_API struct ggml_tensor * ggml_reshape_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0);
-
-    GGML_API struct ggml_tensor * ggml_reshape_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1);
-
-    // return view(a)
-    // TODO: when we start computing gradient, make a copy instead of view
-    GGML_API struct ggml_tensor * ggml_reshape_3d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2);
-
-    GGML_API struct ggml_tensor * ggml_reshape_4d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2,
-            int64_t               ne3);
-
-    // offset in bytes
-    GGML_API struct ggml_tensor * ggml_view_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_view_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            size_t                nb1, // row stride in bytes
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_view_3d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2,
-            size_t                nb1, // row   stride in bytes
-            size_t                nb2, // slice stride in bytes
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_view_4d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int64_t               ne0,
-            int64_t               ne1,
-            int64_t               ne2,
-            int64_t               ne3,
-            size_t                nb1, // row   stride in bytes
-            size_t                nb2, // slice stride in bytes
-            size_t                nb3,
-            size_t                offset);
-
-    GGML_API struct ggml_tensor * ggml_permute(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   axis0,
-            int                   axis1,
-            int                   axis2,
-            int                   axis3);
-
-    // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
-    GGML_API struct ggml_tensor * ggml_transpose(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // supports 3D: a->ne[2] == b->ne[1]
-    GGML_API struct ggml_tensor * ggml_get_rows(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_get_rows_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            struct ggml_tensor  * c);
-
-    GGML_API struct ggml_tensor * ggml_diag(
-        struct ggml_context     * ctx,
-        struct ggml_tensor      * a);
-
-    // set elements above the diagonal to -INF
-    GGML_API struct ggml_tensor * ggml_diag_mask_inf(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past);
-
-    // set elements above the diagonal to 0
-    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past);
-
-    GGML_API struct ggml_tensor * ggml_soft_max(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a);
-
-    // fused soft_max(a*scale + mask)
-    // mask is optional
-    GGML_API struct ggml_tensor * ggml_soft_max_ext(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * mask,
-            float                 scale);
-
-    GGML_API struct ggml_tensor * ggml_soft_max_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // rotary position embedding
-    // if mode & 1 == 1, skip n_past elements (DEPRECATED)
-    // if mode & 2 == 1, GPT-NeoX style
-    // if mode & 4 == 1, ChatGLM style
-    //
-    // b is an int32 vector with size a->ne[2], it contains the positions
-    GGML_API struct ggml_tensor * ggml_rope(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            int                   mode,
-            int                   n_ctx);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            int                   mode,
-            int                   n_ctx);
-
-    // custom RoPE
-    GGML_API struct ggml_tensor * ggml_rope_custom(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            int                   mode,
-            int                   n_ctx,
-            int                   n_orig_ctx,
-            float                 freq_base,
-            float                 freq_scale,
-            float                 ext_factor,
-            float                 attn_factor,
-            float                 beta_fast,
-            float                 beta_slow);
-
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_custom_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            int                   mode,
-            int                   n_ctx,
-            int                   n_orig_ctx,
-            float                 freq_base,
-            float                 freq_scale,
-            float                 ext_factor,
-            float                 attn_factor,
-            float                 beta_fast,
-            float                 beta_slow);
-
-    // compute correction dims for YaRN RoPE scaling
-    void ggml_rope_yarn_corr_dims(
-        int n_dims, int n_orig_ctx, float freq_base, float beta_fast, float beta_slow, float dims[2]);
-
-    // xPos RoPE, in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_rope_xpos_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            float                 base,
-            bool                  down);
-
-    // rotary position embedding backward, i.e compute dx from dy
-    // a - dy
-    GGML_API struct ggml_tensor * ggml_rope_back(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   n_dims,
-            int                   mode,
-            int                   n_ctx,
-            int                   n_orig_ctx,
-            float                 freq_base,
-            float                 freq_scale,
-            float                 ext_factor,
-            float                 attn_factor,
-            float                 beta_fast,
-            float                 beta_slow,
-            float                 xpos_base,
-            bool                  xpos_down);
-
-    // alibi position embedding
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_alibi(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   n_past,
-            int                   n_head,
-            float                 bias_max);
-
-    // clamp
-    // in-place, returns view(a)
-    GGML_API struct ggml_tensor * ggml_clamp(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            float                 min,
-            float                 max);
-
-    GGML_API struct ggml_tensor * ggml_im2col(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                  s0,
-            int                  s1,
-            int                  p0,
-            int                  p1,
-            int                  d0,
-            int                  d1,
-            bool                 is_2D);
-
-    GGML_API struct ggml_tensor * ggml_conv_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,  // stride
-            int                   p0,  // padding
-            int                   d0); // dilation
-
-    // conv_1d with padding = half
-    // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
-    GGML_API struct ggml_tensor* ggml_conv_1d_ph(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s,
-            int                   d);
-
-    GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   p0,
-            int                   d0);
-
-    GGML_API struct ggml_tensor * ggml_conv_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   s0,
-            int                   s1,
-            int                   p0,
-            int                   p1,
-            int                   d0,
-            int                   d1);
-
-
-    // kernel size is a->ne[0] x a->ne[1]
-    // stride is equal to kernel size
-    // padding is zero
-    // example:
-    // a:     16   16    3  768
-    // b:   1024 1024    3    1
-    // res:   64   64  768    1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_sk_p0(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    // kernel size is a->ne[0] x a->ne[1]
-    // stride is 1
-    // padding is half
-    // example:
-    // a:      3    3    256  256
-    // b:     64   64    256    1
-    // res:   64   64    256    1
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_conv_2d_s1_ph(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
-
-    GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b,
-            int                   stride);
-
-    enum ggml_op_pool {
-        GGML_OP_POOL_MAX,
-        GGML_OP_POOL_AVG,
-        GGML_OP_POOL_COUNT,
-    };
-
-    GGML_API struct ggml_tensor * ggml_pool_1d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum ggml_op_pool     op,
-            int                   k0, // kernel size
-            int                   s0, // stride
-            int                   p0); // padding
-
-    // the result will have 2*p0 padding for the first dimension
-    // and 2*p1 padding for the second dimension
-    GGML_API struct ggml_tensor * ggml_pool_2d(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum ggml_op_pool     op,
-            int                   k0,
-            int                   k1,
-            int                   s0,
-            int                   s1,
-            float                 p0,
-            float                 p1);
-
-    // nearest interpolate
-    // used in stable-diffusion
-    GGML_API struct ggml_tensor * ggml_upscale(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   scale_factor);
-
-    // pad each dimension with zeros: [x, ..., x] -> [x, ..., x, 0, ..., 0]
-    GGML_API struct ggml_tensor * ggml_pad(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                  p0,
-            int                  p1,
-            int                  p2,
-            int                  p3);
-
-    // sort rows
-    enum ggml_sort_order {
-        GGML_SORT_ASC,
-        GGML_SORT_DESC,
-    };
-
-    GGML_API struct ggml_tensor * ggml_argsort(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            enum ggml_sort_order  order);
-
-    // top k elements per row
-    GGML_API struct ggml_tensor * ggml_top_k(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   k);
-
-    GGML_API struct ggml_tensor * ggml_flash_attn(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * q,
-            struct ggml_tensor  * k,
-            struct ggml_tensor  * v,
-            bool                  masked);
-
-    GGML_API struct ggml_tensor * ggml_flash_attn_back(
-           struct ggml_context * ctx,
-           struct ggml_tensor  * q,
-           struct ggml_tensor  * k,
-           struct ggml_tensor  * v,
-           struct ggml_tensor  * d,
-           bool                  masked);
-
-    GGML_API struct ggml_tensor * ggml_flash_ff(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * b0,
-            struct ggml_tensor  * b1,
-            struct ggml_tensor  * c0,
-            struct ggml_tensor  * c1);
-
-    // partition into non-overlapping windows with padding if needed
-    // example:
-    // a:   768   64   64    1
-    // w:    14
-    // res: 768   14   14    25
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_win_part(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   w);
-
-    // reverse of ggml_win_part
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_win_unpart(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   w0,
-            int                   h0,
-            int                   w);
-
-    GGML_API struct ggml_tensor * ggml_unary(
-            struct ggml_context * ctx,
-             struct ggml_tensor * a,
-             enum ggml_unary_op op);
-
-    GGML_API struct ggml_tensor * ggml_unary_inplace(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        enum ggml_unary_op op);
-
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_get_rel_pos(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            int                   qh,
-            int                   kh);
-
-    // used in sam
-    GGML_API struct ggml_tensor * ggml_add_rel_pos(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * pw,
-            struct ggml_tensor  * ph);
-
-    GGML_API struct ggml_tensor * ggml_add_rel_pos_inplace(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * a,
-            struct ggml_tensor  * pw,
-            struct ggml_tensor  * ph);
-
-    // custom operators
-
-    typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
-    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
-
-    typedef void (*ggml_custom1_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom2_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-    typedef void (*ggml_custom3_op_f32_t)(struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *, const struct ggml_tensor *);
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_f32(
-            struct ggml_context        * ctx,
-            struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun),
-        "use ggml_map_custom1 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_unary_inplace_f32(
-            struct ggml_context        * ctx,
-            struct ggml_tensor         * a,
-                   ggml_unary_op_f32_t   fun),
-        "use ggml_map_custom1_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_f32(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun),
-        "use ggml_map_custom2 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_binary_inplace_f32(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b,
-                   ggml_binary_op_f32_t   fun),
-        "use ggml_map_custom2_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun),
-        "use ggml_map_custom1 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom1_inplace_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-                   ggml_custom1_op_f32_t   fun),
-        "use ggml_map_custom1_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun),
-        "use ggml_map_custom2 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom2_inplace_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-                   ggml_custom2_op_f32_t   fun),
-        "use ggml_map_custom2_inplace instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-            struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun),
-        "use ggml_map_custom3 instead");
-
-    GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_map_custom3_inplace_f32(
-            struct ggml_context          * ctx,
-            struct ggml_tensor           * a,
-            struct ggml_tensor           * b,
-            struct ggml_tensor           * c,
-                   ggml_custom3_op_f32_t   fun),
-        "use ggml_map_custom3_inplace instead");
-
-    // custom operators v2
-
-    typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata);
-    typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
-    typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
-
-    #define GGML_N_TASKS_MAX -1
-
-    GGML_API struct ggml_tensor * ggml_map_custom1(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            ggml_custom1_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    GGML_API struct ggml_tensor * ggml_map_custom1_inplace(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            ggml_custom1_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    GGML_API struct ggml_tensor * ggml_map_custom2(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            ggml_custom2_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    GGML_API struct ggml_tensor * ggml_map_custom2_inplace(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            ggml_custom2_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    GGML_API struct ggml_tensor * ggml_map_custom3(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            struct ggml_tensor    * c,
-            ggml_custom3_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    GGML_API struct ggml_tensor * ggml_map_custom3_inplace(
-            struct ggml_context   * ctx,
-            struct ggml_tensor    * a,
-            struct ggml_tensor    * b,
-            struct ggml_tensor    * c,
-            ggml_custom3_op_t       fun,
-            int                     n_tasks,
-            void                  * userdata);
-
-    // loss function
-
-    GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b);
-
-    GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
-            struct ggml_context         * ctx,
-            struct ggml_tensor          * a,
-            struct ggml_tensor          * b,
-            struct ggml_tensor          * c);
-
-    //
-    // automatic differentiation
-    //
-
-    GGML_API void ggml_set_param(
-            struct ggml_context * ctx,
-            struct ggml_tensor  * tensor);
-
-
-    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
-    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
-
-    // graph allocation in a context
-    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
-    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
-    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);
-
-    GGML_API size_t ggml_graph_overhead(void);
-    GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
-
-    // ggml_graph_plan() has to be called before ggml_graph_compute()
-    // when plan.work_size > 0, caller must allocate memory for plan.work_data
-    GGML_API struct ggml_cplan ggml_graph_plan   (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
-    GGML_API int               ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
-
-    // same as ggml_graph_compute() but the work data is allocated as a part of the context
-    // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
-    GGML_API void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
-
-    GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
-
-    GGML_API void                 ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
-    GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
-
-    // print info and performance information for the graph
-    GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
-
-    // dump the graph into a file using the dot format
-    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
-
-    // build gradient checkpointing backward graph gb for gf using provided checkpoints
-    // gb_tmp will contain original backward graph with rewritten backward process nodes,
-    // but without the second forward pass nodes.
-    GGML_API void ggml_build_backward_gradient_checkpointing(
-            struct ggml_context   * ctx,
-            struct ggml_cgraph    * gf,
-            struct ggml_cgraph    * gb,
-            struct ggml_cgraph    * gb_tmp,
-            struct ggml_tensor  * * checkpoints,
-            int                     n_checkpoints);
-    //
-    // optimization
-    //
-
-    // optimization methods
-    enum ggml_opt_type {
-        GGML_OPT_ADAM,
-        GGML_OPT_LBFGS,
-    };
-
-    // linesearch methods
-    enum ggml_linesearch {
-        GGML_LINESEARCH_DEFAULT = 1,
-
-        GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
-        GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
-        GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
-    };
-
-    // optimization return values
-    enum ggml_opt_result {
-        GGML_OPT_OK = 0,
-        GGML_OPT_DID_NOT_CONVERGE,
-        GGML_OPT_NO_CONTEXT,
-        GGML_OPT_INVALID_WOLFE,
-        GGML_OPT_FAIL,
-        GGML_OPT_CANCEL,
-
-        GGML_LINESEARCH_FAIL = -128,
-        GGML_LINESEARCH_MINIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_STEP,
-        GGML_LINESEARCH_MAXIMUM_ITERATIONS,
-        GGML_LINESEARCH_INVALID_PARAMETERS,
-    };
-
-    typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
-    typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
-
-    // optimization parameters
-    //
-    //   see ggml.c (ggml_opt_default_params) for default values
-    //
-    struct ggml_opt_params {
-        enum ggml_opt_type type;
-
-        size_t graph_size;
-
-        int n_threads;
-
-        // delta-based convergence test
-        //
-        //   if past == 0 - disabled
-        //   if past > 0:
-        //     stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
-        //
-        int past;
-        float delta;
-
-        // maximum number of iterations without improvement
-        //
-        //   if 0 - disabled
-        //   if > 0:
-        //     assume convergence if no cost improvement in this number of iterations
-        //
-        int max_no_improvement;
-
-        bool print_forward_graph;
-        bool print_backward_graph;
-
-        int n_gradient_accumulation;
-
-        // ADAM parameters
-        struct {
-            int n_iter;
-
-            float sched; // schedule multiplier (fixed, decay or warmup)
-            float decay; // weight decay for AdamW, use 0.0f to disable
-            int   decay_min_ndim; // minimum number of tensor dimension to apply weight decay
-            float alpha; // learning rate
-            float beta1;
-            float beta2;
-            float eps;   // epsilon for numerical stability
-            float eps_f; // epsilon for convergence test
-            float eps_g; // epsilon for convergence test
-            float gclip; // gradient clipping
-        } adam;
-
-        // LBFGS parameters
-        struct {
-            int m; // number of corrections to approximate the inv. Hessian
-            int n_iter;
-            int max_linesearch;
-
-            float eps;      // convergence tolerance
-            float ftol;     // line search tolerance
-            float wolfe;
-            float min_step;
-            float max_step;
-
-            enum ggml_linesearch linesearch;
-        } lbfgs;
-    };
-
-    struct ggml_opt_context {
-        struct ggml_context * ctx;
-        struct ggml_opt_params params;
-
-        int iter;
-        int64_t nx; // number of parameter elements
-
-        bool just_initialized;
-
-        float loss_before;
-        float loss_after;
-
-        struct {
-            struct ggml_tensor * g;  // current gradient
-            struct ggml_tensor * m;  // first moment
-            struct ggml_tensor * v;  // second moment
-            struct ggml_tensor * pf; // past function values
-            float fx_best;
-            float fx_prev;
-            int n_no_improvement;
-        } adam;
-
-        struct {
-            struct ggml_tensor * x;    // current parameters
-            struct ggml_tensor * xp;   // previous parameters
-            struct ggml_tensor * g;    // current gradient
-            struct ggml_tensor * gp;   // previous gradient
-            struct ggml_tensor * d;    // search direction
-            struct ggml_tensor * pf;   // past function values
-            struct ggml_tensor * lmal; // the L-BFGS memory alpha
-            struct ggml_tensor * lmys; // the L-BFGS memory ys
-            struct ggml_tensor * lms;  // the L-BFGS memory s
-            struct ggml_tensor * lmy;  // the L-BFGS memory y
-            float fx_best;
-            float step;
-            int j;
-            int k;
-            int end;
-            int n_no_improvement;
-        } lbfgs;
-    };
-
-    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
-
-    // optimize the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt(
-            struct ggml_context * ctx,
-            struct ggml_opt_params params,
-            struct ggml_tensor * f);
-
-    // initialize optimizer context
-    GGML_API void ggml_opt_init(
-            struct ggml_context     * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_opt_params    params,
-            int64_t                   nx);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f);
-
-    // continue optimizing the function defined by the tensor f
-    GGML_API enum ggml_opt_result ggml_opt_resume_g(
-            struct ggml_context * ctx,
-            struct ggml_opt_context * opt,
-            struct ggml_tensor * f,
-            struct ggml_cgraph * gf,
-            struct ggml_cgraph * gb,
-            ggml_opt_callback callback,
-            void * callback_data);
-
-    //
-    // quantization
-    //
-
-    // TODO: these would probably get removed in favor of the more general ggml_quantize_chunk
-    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
-
-    GGML_API size_t ggml_quantize_q2_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q3_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q4_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist);
-    GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist);
-
-    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
-
-    //
-    // gguf
-    //
-
-    enum gguf_type {
-        GGUF_TYPE_UINT8   = 0,
-        GGUF_TYPE_INT8    = 1,
-        GGUF_TYPE_UINT16  = 2,
-        GGUF_TYPE_INT16   = 3,
-        GGUF_TYPE_UINT32  = 4,
-        GGUF_TYPE_INT32   = 5,
-        GGUF_TYPE_FLOAT32 = 6,
-        GGUF_TYPE_BOOL    = 7,
-        GGUF_TYPE_STRING  = 8,
-        GGUF_TYPE_ARRAY   = 9,
-        GGUF_TYPE_UINT64  = 10,
-        GGUF_TYPE_INT64   = 11,
-        GGUF_TYPE_FLOAT64 = 12,
-        GGUF_TYPE_COUNT,       // marks the end of the enum
-    };
-
-    struct gguf_context;
-
-    struct gguf_init_params {
-        bool no_alloc;
-
-        // if not NULL, create a ggml_context and allocate the tensor data in it
-        struct ggml_context ** ctx;
-    };
-
-    GGML_API struct gguf_context * gguf_init_empty(void);
-    GGML_API struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_params params);
-    //GGML_API struct gguf_context * gguf_init_from_buffer(..);
-
-    GGML_API void gguf_free(struct gguf_context * ctx);
-
-    GGML_API const char * gguf_type_name(enum gguf_type type);
-
-    GGML_API int    gguf_get_version    (const struct gguf_context * ctx);
-    GGML_API size_t gguf_get_alignment  (const struct gguf_context * ctx);
-    GGML_API size_t gguf_get_data_offset(const struct gguf_context * ctx);
-    GGML_API void * gguf_get_data       (const struct gguf_context * ctx);
-
-    GGML_API int          gguf_get_n_kv(const struct gguf_context * ctx);
-    GGML_API int          gguf_find_key(const struct gguf_context * ctx, const char * key);
-    GGML_API const char * gguf_get_key (const struct gguf_context * ctx, int key_id);
-
-    GGML_API enum gguf_type gguf_get_kv_type (const struct gguf_context * ctx, int key_id);
-    GGML_API enum gguf_type gguf_get_arr_type(const struct gguf_context * ctx, int key_id);
-
-    // will abort if the wrong type is used for the key
-    GGML_API uint8_t      gguf_get_val_u8  (const struct gguf_context * ctx, int key_id);
-    GGML_API int8_t       gguf_get_val_i8  (const struct gguf_context * ctx, int key_id);
-    GGML_API uint16_t     gguf_get_val_u16 (const struct gguf_context * ctx, int key_id);
-    GGML_API int16_t      gguf_get_val_i16 (const struct gguf_context * ctx, int key_id);
-    GGML_API uint32_t     gguf_get_val_u32 (const struct gguf_context * ctx, int key_id);
-    GGML_API int32_t      gguf_get_val_i32 (const struct gguf_context * ctx, int key_id);
-    GGML_API float        gguf_get_val_f32 (const struct gguf_context * ctx, int key_id);
-    GGML_API uint64_t     gguf_get_val_u64 (const struct gguf_context * ctx, int key_id);
-    GGML_API int64_t      gguf_get_val_i64 (const struct gguf_context * ctx, int key_id);
-    GGML_API double       gguf_get_val_f64 (const struct gguf_context * ctx, int key_id);
-    GGML_API bool         gguf_get_val_bool(const struct gguf_context * ctx, int key_id);
-    GGML_API const char * gguf_get_val_str (const struct gguf_context * ctx, int key_id);
-    GGML_API const void * gguf_get_val_data(const struct gguf_context * ctx, int key_id);
-    GGML_API int          gguf_get_arr_n   (const struct gguf_context * ctx, int key_id);
-    GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
-    GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);
-
-    GGML_API int            gguf_get_n_tensors    (const struct gguf_context * ctx);
-    GGML_API int            gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
-    GGML_API size_t         gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
-    GGML_API char *         gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
-    GGML_API enum ggml_type gguf_get_tensor_type  (const struct gguf_context * ctx, int i);
-
-    // overrides existing values or adds a new one
-    GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
-    GGML_API void gguf_set_val_i8  (struct gguf_context * ctx, const char * key, int8_t   val);
-    GGML_API void gguf_set_val_u16 (struct gguf_context * ctx, const char * key, uint16_t val);
-    GGML_API void gguf_set_val_i16 (struct gguf_context * ctx, const char * key, int16_t  val);
-    GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
-    GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t  val);
-    GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float    val);
-    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
-    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t  val);
-    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double   val);
-    GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool     val);
-    GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
-    GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
-    GGML_API void gguf_set_arr_str (struct gguf_context * ctx, const char * key, const char ** data, int n);
-
-    // set or add KV pairs from another context
-    GGML_API void gguf_set_kv(struct gguf_context * ctx, struct gguf_context * src);
-
-    // manage tensor info
-    GGML_API void gguf_add_tensor(struct gguf_context * ctx, const struct ggml_tensor * tensor);
-    GGML_API void gguf_set_tensor_type(struct gguf_context * ctx, const char * name, enum ggml_type type);
-    GGML_API void gguf_set_tensor_data(struct gguf_context * ctx, const char * name, const void * data, size_t size);
-
-    // writing gguf files can be done in 2 ways:
-    //
-    // - write the entire gguf_context to a binary file in a single pass:
-    //
-    //   gguf_write_to_file(ctx, fname);
-    //
-    // - first prepare a file with a placeholder for the meta data, write the tensor data, then write the meta data:
-    //
-    //   FILE * f = fopen(fname, "wb");
-    //   fseek(f, gguf_get_meta_size(ctx), SEEK_SET);
-    //   fwrite(f, ...);
-    //   void * data = gguf_meta_get_meta_data(ctx);
-    //   fseek(f, 0, SEEK_SET);
-    //   fwrite(f, data, gguf_get_meta_size(ctx));
-    //   free(data);
-    //   fclose(f);
-    //
-
-    // write the entire context to a binary file
-    GGML_API void gguf_write_to_file(const struct gguf_context * ctx, const char * fname, bool only_meta);
-
-    // get the size in bytes of the meta data (header, kv pairs, tensor info) including padding
-    GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
-    GGML_API void   gguf_get_meta_data(const struct gguf_context * ctx, void * data);
-
-    //
-    // system info
-    //
-
-    GGML_API int ggml_cpu_has_avx        (void);
-    GGML_API int ggml_cpu_has_avx2       (void);
-    GGML_API int ggml_cpu_has_avx512     (void);
-    GGML_API int ggml_cpu_has_avx512_vbmi(void);
-    GGML_API int ggml_cpu_has_avx512_vnni(void);
-    GGML_API int ggml_cpu_has_fma        (void);
-    GGML_API int ggml_cpu_has_neon       (void);
-    GGML_API int ggml_cpu_has_arm_fma    (void);
-    GGML_API int ggml_cpu_has_metal      (void);
-    GGML_API int ggml_cpu_has_f16c       (void);
-    GGML_API int ggml_cpu_has_fp16_va    (void);
-    GGML_API int ggml_cpu_has_wasm_simd  (void);
-    GGML_API int ggml_cpu_has_blas       (void);
-    GGML_API int ggml_cpu_has_cublas     (void);
-    GGML_API int ggml_cpu_has_clblast    (void);
-    GGML_API int ggml_cpu_has_gpublas    (void);
-    GGML_API int ggml_cpu_has_sse3       (void);
-    GGML_API int ggml_cpu_has_ssse3      (void);
-    GGML_API int ggml_cpu_has_vsx        (void);
-
-    //
-    // Internal types and functions exposed for tests and benchmarks
-    //
-
-#ifdef  __cplusplus
-// restrict not standard in C++
-#define GGML_RESTRICT
-#else
-#define GGML_RESTRICT restrict
-#endif
-    typedef void (*ggml_to_float_t)  (const void  * GGML_RESTRICT x, float * GGML_RESTRICT y, int k);
-    typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
-    typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
-
-    typedef struct dpct_type_994041 {
-        const char      * type_name;
-        int               blck_size;
-        size_t            type_size;
-        bool              is_quantized;
-        ggml_to_float_t   to_float;
-        ggml_from_float_t from_float;
-        ggml_from_float_t from_float_reference;
-        ggml_vec_dot_t    vec_dot;
-        enum ggml_type    vec_dot_type;
-    } ggml_type_traits_t;
-
-    GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
-
-#ifdef  __cplusplus
-}
-#endif
diff --git a/dpcpp_out2/ggml.h.yaml b/dpcpp_out2/ggml.h.yaml
deleted file mode 100644
index 47d52a213f243..0000000000000
--- a/dpcpp_out2/ggml.h.yaml
+++ /dev/null
@@ -1,100 +0,0 @@
----
-MainSourceFile:  '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/dpcpp_out2/ggml.h'
-Replacements:
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h'
-    Offset:          7458
-    Length:          0
-    ReplacementText: "#define DPCT_PROFILING_ENABLED\n#include <sycl/sycl.hpp>\n#include <dpct/dpct.hpp>\n"
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h'
-    Offset:          10556
-    Length:          10
-    ReplacementText: SYCL_LANGUAGE_VERSION
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-  - FilePath:        '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h'
-    Offset:          82284
-    Length:          0
-    ReplacementText: ' dpct_type_994041'
-    ConstantFlag:    ''
-    ConstantOffset:  0
-    InitStr:         ''
-    NewHostVarName:  ''
-    BlockLevelFormatFlag: false
-MainSourceFilesDigest:
-  - MainSourceFile:  '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub/ggml.h'
-    Digest:          a7f88ed7f3bbff01c9713ad58f5dac5b
-DpctVersion:     18.0.0
-MainHelperFileName: ''
-USMLevel:        ''
-FeatureMap:      {}
-CompileTargets:  {}
-OptionMap:
-  AnalysisScopePath:
-    Value:           '/ws1/jianyuzh/ws1/llama.cpp/llama.cpp_pub'
-    Specified:       false
-  AsyncHandler:
-    Value:           'false'
-    Specified:       false
-  CommentsEnabled:
-    Value:           'false'
-    Specified:       false
-  CompilationsDir:
-    Value:           ''
-    Specified:       false
-  CtadEnabled:
-    Value:           'false'
-    Specified:       false
-  EnablepProfiling:
-    Value:           'true'
-    Specified:       true
-  ExperimentalFlag:
-    Value:           '0'
-    Specified:       false
-  ExplicitClNamespace:
-    Value:           'false'
-    Specified:       false
-  ExplicitNamespace:
-    Value:           '20'
-    Specified:       false
-  ExtensionDDFlag:
-    Value:           '0'
-    Specified:       false
-  ExtensionDEFlag:
-    Value:           '4294967295'
-    Specified:       false
-  HelperFuncPreferenceFlag:
-    Value:           '0'
-    Specified:       false
-  NDRangeDim:
-    Value:           '3'
-    Specified:       false
-  NoDRYPattern:
-    Value:           'false'
-    Specified:       false
-  NoUseGenericSpace:
-    Value:           ''
-    Specified:       true
-  OptimizeMigration:
-    Value:           'false'
-    Specified:       false
-  ProcessAll:
-    Value:           'false'
-    Specified:       false
-  RuleFile:
-    Value:           ''
-    Specified:       false
-  SyclNamedLambda:
-    Value:           'false'
-    Specified:       false
-  UsmLevel:
-    Value:           '1'
-    Specified:       false
-...
diff --git a/dpct.hpp b/dpct.hpp
deleted file mode 100644
index 874fa13094ae5..0000000000000
--- a/dpct.hpp
+++ /dev/null
@@ -1,2831 +0,0 @@
-// COPY from DPCT head files
-// To clear the code, copy/paste the variable/macro/function from following files.
-// It' possible to get better performance from newer function version DPCT head files.
-// #include <dpct/dpct.hpp>
-// #include <dpct/blas_utils.hpp>
-// #include <dpct/lib_common_utils.hpp>
-// #include <dpct/math.hpp>
-// #include <dpct/utils.hpp>
-// #include <dpct/memory.hpp>
-// #include <dpct/device.hpp>
-// #include <dpct/lapack_utils.hpp>
-
-
-#include <algorithm>
-#include <array>
-#include <cstring>
-#include <iostream>
-#include <mutex>
-#include <set>
-#include <sstream>
-#include <map>
-#include <vector>
-#include <thread>
-
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <mutex>
-#include <unordered_map>
-#include <map>
-#include <utility>
-#include <thread>
-#include <type_traits>
-
-#include <sycl/sycl.hpp>
-#include <oneapi/mkl.hpp>
-
-#if defined(__linux__)
-#include <sys/mman.h>
-#elif defined(_WIN64)
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#else
-#error "Only support Windows and Linux."
-#endif
-
-#if defined(__linux__)
-#include <unistd.h>
-#include <sys/syscall.h>
-#endif
-#if defined(_WIN64)
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#endif
-
-#define DPCT_COMPATIBILITY_TEMP (900)
-
-#if defined(_MSC_VER)
-#define __dpct_align__(n) __declspec(align(n))
-#define __dpct_inline__ __forceinline
-#else
-#define __dpct_align__(n) __attribute__((aligned(n)))
-#define __dpct_inline__ __inline__ __attribute__((always_inline))
-#endif
-
-#if defined(_MSC_VER)
-#define __dpct_noinline__ __declspec(noinline)
-#else
-#define __dpct_noinline__ __attribute__((noinline))
-#endif
-
-namespace dpct
-{
-  typedef sycl::queue *queue_ptr;
-  typedef sycl::event *event_ptr;
-  typedef char *device_ptr;
-  typedef uint8_t byte_t;
-  typedef sycl::buffer<byte_t> buffer_t;
-
-  /// SYCL default exception handler
-  inline auto exception_handler = [](sycl::exception_list exceptions)
-  {
-    for (std::exception_ptr const &e : exceptions)
-    {
-      try
-      {
-        std::rethrow_exception(e);
-      }
-      catch (sycl::exception const &e)
-      {
-        std::cerr << "Caught asynchronous SYCL exception:" << std::endl
-                  << e.what() << std::endl
-                  << "Exception caught at file:" << __FILE__
-                  << ", line:" << __LINE__ << std::endl;
-      }
-    }
-  };
-
-  enum error_code
-  {
-    success = 0,
-    default_error = 999
-  };
-
-  enum memcpy_direction
-  {
-    host_to_host,
-    host_to_device,
-    device_to_host,
-    device_to_device,
-    automatic
-  };
-
-  enum memory_region
-  {
-    global = 0, // device global memory
-    constant,   // device constant memory
-    local,      // device local memory
-    shared,     // memory which can be accessed by host and device
-  };
-
-  enum class library_data_t : unsigned char
-  {
-    real_float = 0,
-    complex_float,
-    real_double,
-    complex_double,
-    real_half,
-    complex_half,
-    real_bfloat16,
-    complex_bfloat16,
-    real_int4,
-    complex_int4,
-    real_uint4,
-    complex_uint4,
-    real_int8,
-    complex_int8,
-    real_uint8,
-    complex_uint8,
-    real_int16,
-    complex_int16,
-    real_uint16,
-    complex_uint16,
-    real_int32,
-    complex_int32,
-    real_uint32,
-    complex_uint32,
-    real_int64,
-    complex_int64,
-    real_uint64,
-    complex_uint64,
-    real_int8_4,
-    real_int8_32,
-    real_uint8_4,
-    library_data_t_size
-  };
-
-  template <typename T>
-  struct DataType
-  {
-    using T2 = T;
-  };
-  template <typename T>
-  struct DataType<sycl::vec<T, 2>>
-  {
-    using T2 = std::complex<T>;
-  };
-
-  static void destroy_event(event_ptr event)
-  {
-    delete event;
-  }
-
-  static inline unsigned int get_tid()
-  {
-#if defined(__linux__)
-    return syscall(SYS_gettid);
-#elif defined(_WIN64)
-    return GetCurrentThreadId();
-#else
-#error "Only support Windows and Linux."
-#endif
-  }
-
-  namespace detail
-  {
-    static void get_version(const sycl::device &dev, int &major, int &minor)
-    {
-      // Version string has the following format:
-      // a. OpenCL<space><major.minor><space><vendor-specific-information>
-      // b. <major.minor>
-      std::string ver;
-      ver = dev.get_info<sycl::info::device::version>();
-      std::string::size_type i = 0;
-      while (i < ver.size())
-      {
-        if (isdigit(ver[i]))
-          break;
-        i++;
-      }
-      major = std::stoi(&(ver[i]));
-      while (i < ver.size())
-      {
-        if (ver[i] == '.')
-          break;
-        i++;
-      }
-      i++;
-      minor = std::stoi(&(ver[i]));
-    }
-
-    template <typename tag, typename T>
-    class generic_error_type
-    {
-    public:
-      generic_error_type() = default;
-      generic_error_type(T value) : value{value} {}
-      operator T() const { return value; }
-
-    private:
-      T value;
-    };
-
-  } // namespace detail
-
-  /// Pitched 2D/3D memory data.
-  class pitched_data
-  {
-  public:
-    pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
-    pitched_data(void *data, size_t pitch, size_t x, size_t y)
-        : _data(data), _pitch(pitch), _x(x), _y(y) {}
-
-    void *get_data_ptr() { return _data; }
-    void set_data_ptr(void *data) { _data = data; }
-
-    size_t get_pitch() { return _pitch; }
-    void set_pitch(size_t pitch) { _pitch = pitch; }
-
-    size_t get_x() { return _x; }
-    void set_x(size_t x) { _x = x; };
-
-    size_t get_y() { return _y; }
-    void set_y(size_t y) { _y = y; }
-
-  private:
-    void *_data;
-    size_t _pitch, _x, _y;
-  };
-
-  class device_info
-  {
-  public:
-    // get interface
-    const char *get_name() const { return _name; }
-    char *get_name() { return _name; }
-    template <typename WorkItemSizesTy = sycl::range<3>,
-              std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                   std::is_same_v<WorkItemSizesTy, int *>,
-                               int> = 0>
-    auto get_max_work_item_sizes() const
-    {
-      if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
-        return sycl::range<3>(_max_work_item_sizes_i[0],
-                              _max_work_item_sizes_i[1],
-                              _max_work_item_sizes_i[2]);
-      else
-      {
-        return _max_work_item_sizes_i;
-      }
-    }
-    template <typename WorkItemSizesTy = sycl::range<3>,
-              std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                   std::is_same_v<WorkItemSizesTy, int *>,
-                               int> = 0>
-    auto get_max_work_item_sizes()
-    {
-      if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
-        return sycl::range<3>(_max_work_item_sizes_i[0],
-                              _max_work_item_sizes_i[1],
-                              _max_work_item_sizes_i[2]);
-      else
-      {
-        return _max_work_item_sizes_i;
-      }
-    }
-    bool get_host_unified_memory() const { return _host_unified_memory; }
-    int get_major_version() const { return _major; }
-    int get_minor_version() const { return _minor; }
-    int get_integrated() const { return _integrated; }
-    int get_max_clock_frequency() const { return _frequency; }
-    int get_max_compute_units() const { return _max_compute_units; }
-    int get_max_work_group_size() const { return _max_work_group_size; }
-    int get_max_sub_group_size() const { return _max_sub_group_size; }
-    int get_max_work_items_per_compute_unit() const
-    {
-      return _max_work_items_per_compute_unit;
-    }
-    int get_max_register_size_per_work_group() const
-    {
-      return _max_register_size_per_work_group;
-    }
-    template <typename NDRangeSizeTy = size_t *,
-              std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                   std::is_same_v<NDRangeSizeTy, int *>,
-                               int> = 0>
-    auto get_max_nd_range_size() const
-    {
-      if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
-        return _max_nd_range_size;
-      else
-        return _max_nd_range_size_i;
-    }
-    template <typename NDRangeSizeTy = size_t *,
-              std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                   std::is_same_v<NDRangeSizeTy, int *>,
-                               int> = 0>
-    auto get_max_nd_range_size()
-    {
-      if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
-        return _max_nd_range_size;
-      else
-        return _max_nd_range_size_i;
-    }
-    size_t get_global_mem_size() const { return _global_mem_size; }
-    size_t get_local_mem_size() const { return _local_mem_size; }
-    /// Returns the maximum clock rate of device's global memory in kHz. If
-    /// compiler does not support this API then returns default value 3200000 kHz.
-    unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
-    /// Returns the maximum bus width between device and memory in bits. If
-    /// compiler does not support this API then returns default value 64 bits.
-    unsigned int get_memory_bus_width() const { return _memory_bus_width; }
-    uint32_t get_device_id() const { return _device_id; }
-    std::array<unsigned char, 16> get_uuid() const { return _uuid; }
-    /// Returns global memory cache size in bytes.
-    unsigned int get_global_mem_cache_size() const
-    {
-      return _global_mem_cache_size;
-    }
-
-    // set interface
-    void set_name(const char *name)
-    {
-      size_t length = strlen(name);
-      if (length < 256)
-      {
-        std::memcpy(_name, name, length + 1);
-      }
-      else
-      {
-        std::memcpy(_name, name, 255);
-        _name[255] = '\0';
-      }
-    }
-    void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes)
-    {
-      for (int i = 0; i < 3; ++i)
-        _max_work_item_sizes_i[i] = max_work_item_sizes[i];
-    }
-    [[deprecated]] void
-    set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
-    {
-      for (int i = 0; i < 3; ++i)
-      {
-        _max_work_item_sizes_i[i] = max_work_item_sizes[i];
-      }
-    }
-    void set_host_unified_memory(bool host_unified_memory)
-    {
-      _host_unified_memory = host_unified_memory;
-    }
-    void set_major_version(int major) { _major = major; }
-    void set_minor_version(int minor) { _minor = minor; }
-    void set_integrated(int integrated) { _integrated = integrated; }
-    void set_max_clock_frequency(int frequency) { _frequency = frequency; }
-    void set_max_compute_units(int max_compute_units)
-    {
-      _max_compute_units = max_compute_units;
-    }
-    void set_global_mem_size(size_t global_mem_size)
-    {
-      _global_mem_size = global_mem_size;
-    }
-    void set_local_mem_size(size_t local_mem_size)
-    {
-      _local_mem_size = local_mem_size;
-    }
-    void set_max_work_group_size(int max_work_group_size)
-    {
-      _max_work_group_size = max_work_group_size;
-    }
-    void set_max_sub_group_size(int max_sub_group_size)
-    {
-      _max_sub_group_size = max_sub_group_size;
-    }
-    void
-    set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
-    {
-      _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
-    }
-    void set_max_nd_range_size(int max_nd_range_size[])
-    {
-      for (int i = 0; i < 3; i++)
-      {
-        _max_nd_range_size[i] = max_nd_range_size[i];
-        _max_nd_range_size_i[i] = max_nd_range_size[i];
-      }
-    }
-    void set_memory_clock_rate(unsigned int memory_clock_rate)
-    {
-      _memory_clock_rate = memory_clock_rate;
-    }
-    void set_memory_bus_width(unsigned int memory_bus_width)
-    {
-      _memory_bus_width = memory_bus_width;
-    }
-    void
-    set_max_register_size_per_work_group(int max_register_size_per_work_group)
-    {
-      _max_register_size_per_work_group = max_register_size_per_work_group;
-    }
-    void set_device_id(uint32_t device_id)
-    {
-      _device_id = device_id;
-    }
-    void set_uuid(std::array<unsigned char, 16> uuid)
-    {
-      _uuid = std::move(uuid);
-    }
-    void set_global_mem_cache_size(unsigned int global_mem_cache_size)
-    {
-      _global_mem_cache_size = global_mem_cache_size;
-    }
-
-  private:
-    char _name[256];
-    int _max_work_item_sizes_i[3];
-    bool _host_unified_memory = false;
-    int _major;
-    int _minor;
-    int _integrated = 0;
-    int _frequency;
-    // Set estimated value 3200000 kHz as default value.
-    unsigned int _memory_clock_rate = 3200000;
-    // Set estimated value 64 bits as default value.
-    unsigned int _memory_bus_width = 64;
-    unsigned int _global_mem_cache_size;
-    int _max_compute_units;
-    int _max_work_group_size;
-    int _max_sub_group_size;
-    int _max_work_items_per_compute_unit;
-    int _max_register_size_per_work_group;
-    size_t _global_mem_size;
-    size_t _local_mem_size;
-    size_t _max_nd_range_size[3];
-    int _max_nd_range_size_i[3];
-    uint32_t _device_id;
-    std::array<unsigned char, 16> _uuid;
-  };
-
-  static int get_major_version(const sycl::device &dev)
-  {
-    int major, minor;
-    detail::get_version(dev, major, minor);
-    return major;
-  }
-
-  static int get_minor_version(const sycl::device &dev)
-  {
-    int major, minor;
-    detail::get_version(dev, major, minor);
-    return minor;
-  }
-
-  static void get_device_info(device_info &out, const sycl::device &dev)
-  {
-    device_info prop;
-    prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
-
-    int major, minor;
-    detail::get_version(dev, major, minor);
-    prop.set_major_version(major);
-    prop.set_minor_version(minor);
-
-    prop.set_max_work_item_sizes(
-#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
-        // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
-        // is an enum class element
-        dev.get_info<sycl::info::device::max_work_item_sizes>());
-#else
-        // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
-        // an int
-        dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
-#endif
-    prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));
-
-    prop.set_max_clock_frequency(
-        dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);
-
-    prop.set_max_compute_units(
-        dev.get_info<sycl::info::device::max_compute_units>());
-    prop.set_max_work_group_size(
-        dev.get_info<sycl::info::device::max_work_group_size>());
-    prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
-    prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
-
-#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
-    if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
-    {
-      unsigned int tmp =
-          dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
-      if (tmp != 0)
-        prop.set_memory_clock_rate(1000 * tmp);
-    }
-    if (dev.has(sycl::aspect::ext_intel_memory_bus_width))
-    {
-      prop.set_memory_bus_width(
-          dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
-    }
-    if (dev.has(sycl::aspect::ext_intel_device_id))
-    {
-      prop.set_device_id(
-          dev.get_info<sycl::ext::intel::info::device::device_id>());
-    }
-    if (dev.has(sycl::aspect::ext_intel_device_info_uuid))
-    {
-      prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
-    }
-#elif defined(_MSC_VER) && !defined(__clang__)
-#pragma message("get_device_info: querying memory_clock_rate and \
-        memory_bus_width are not supported by the compiler used. \
-        Use 3200000 kHz as memory_clock_rate default value. \
-        Use 64 bits as memory_bus_width default value.")
-#else
-#warning "get_device_info: querying memory_clock_rate and \
-        memory_bus_width are not supported by the compiler used. \
-        Use 3200000 kHz as memory_clock_rate default value. \
-        Use 64 bits as memory_bus_width default value."
-#endif
-
-    size_t max_sub_group_size = 1;
-    std::vector<size_t> sub_group_sizes =
-        dev.get_info<sycl::info::device::sub_group_sizes>();
-
-    for (const auto &sub_group_size : sub_group_sizes)
-    {
-      if (max_sub_group_size < sub_group_size)
-        max_sub_group_size = sub_group_size;
-    }
-
-    prop.set_max_sub_group_size(max_sub_group_size);
-
-    prop.set_max_work_items_per_compute_unit(
-        dev.get_info<sycl::info::device::max_work_group_size>());
-    int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
-    prop.set_max_nd_range_size(max_nd_range_size);
-
-    // Estimates max register size per work group, feel free to update the value
-    // according to device properties.
-    prop.set_max_register_size_per_work_group(65536);
-
-    prop.set_global_mem_cache_size(
-        dev.get_info<sycl::info::device::global_mem_cache_size>());
-    out = prop;
-  }
-
-  /// dpct device extension
-  class device_ext : public sycl::device
-  {
-    typedef std::mutex mutex_type;
-
-  public:
-    device_ext() : sycl::device(), _ctx(*this) {}
-    ~device_ext()
-    {
-      std::lock_guard<mutex_type> lock(m_mutex);
-      clear_queues();
-    }
-    device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
-    {
-      std::lock_guard<mutex_type> lock(m_mutex);
-      init_queues();
-    }
-
-    int is_native_atomic_supported() { return 0; }
-    int get_major_version() const
-    {
-      return dpct::get_major_version(*this);
-    }
-
-    int get_minor_version() const
-    {
-      return dpct::get_minor_version(*this);
-    }
-
-    int get_max_compute_units() const
-    {
-      return get_device_info().get_max_compute_units();
-    }
-
-    /// Return the maximum clock frequency of this device in KHz.
-    int get_max_clock_frequency() const
-    {
-      return get_device_info().get_max_clock_frequency();
-    }
-
-    int get_integrated() const { return get_device_info().get_integrated(); }
-
-    int get_max_sub_group_size() const
-    {
-      return get_device_info().get_max_sub_group_size();
-    }
-
-    int get_max_register_size_per_work_group() const
-    {
-      return get_device_info().get_max_register_size_per_work_group();
-    }
-
-    int get_max_work_group_size() const
-    {
-      return get_device_info().get_max_work_group_size();
-    }
-
-    int get_mem_base_addr_align() const
-    {
-      return get_info<sycl::info::device::mem_base_addr_align>();
-    }
-
-    size_t get_global_mem_size() const
-    {
-      return get_device_info().get_global_mem_size();
-    }
-
-    /// Get the number of bytes of free and total memory on the SYCL device.
-    /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
-    /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
-    void get_memory_info(size_t &free_memory, size_t &total_memory)
-    {
-#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
-      if (!has(sycl::aspect::ext_intel_free_memory))
-      {
-        std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
-        free_memory = 0;
-      }
-      else
-      {
-        free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
-      }
-#else
-      std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
-      free_memory = 0;
-#if defined(_MSC_VER) && !defined(__clang__)
-#pragma message("Querying the number of bytes of free memory is not supported")
-#else
-#warning "Querying the number of bytes of free memory is not supported"
-#endif
-#endif
-      total_memory = get_device_info().get_global_mem_size();
-    }
-
-    void get_device_info(device_info &out) const
-    {
-      dpct::get_device_info(out, *this);
-    }
-
-    device_info get_device_info() const
-    {
-      device_info prop;
-      dpct::get_device_info(prop, *this);
-      return prop;
-    }
-
-    void reset()
-    {
-      std::lock_guard<mutex_type> lock(m_mutex);
-      clear_queues();
-      init_queues();
-    }
-
-    sycl::queue &in_order_queue() { return *_q_in_order; }
-
-    sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
-
-    sycl::queue &default_queue()
-    {
-#ifdef DPCT_USM_LEVEL_NONE
-      return out_of_order_queue();
-#else
-      return in_order_queue();
-#endif // DPCT_USM_LEVEL_NONE
-    }
-
-    void queues_wait_and_throw()
-    {
-      std::unique_lock<mutex_type> lock(m_mutex);
-      std::vector<std::shared_ptr<sycl::queue>> current_queues(
-          _queues);
-      lock.unlock();
-      for (const auto &q : current_queues)
-      {
-        q->wait_and_throw();
-      }
-      // Guard the destruct of current_queues to make sure the ref count is safe.
-      lock.lock();
-    }
-
-    sycl::queue *create_queue(bool enable_exception_handler = false)
-    {
-#ifdef DPCT_USM_LEVEL_NONE
-      return create_out_of_order_queue(enable_exception_handler);
-#else
-      return create_in_order_queue(enable_exception_handler);
-#endif // DPCT_USM_LEVEL_NONE
-    }
-
-    sycl::queue *create_in_order_queue(bool enable_exception_handler = false)
-    {
-      std::lock_guard<mutex_type> lock(m_mutex);
-      return create_queue_impl(enable_exception_handler,
-                               sycl::property::queue::in_order());
-    }
-
-    sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false)
-    {
-      std::lock_guard<mutex_type> lock(m_mutex);
-      return create_queue_impl(enable_exception_handler);
-    }
-
-    void destroy_queue(sycl::queue *&queue)
-    {
-      std::lock_guard<mutex_type> lock(m_mutex);
-      _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
-                                   [=](const std::shared_ptr<sycl::queue> &q) -> bool
-                                   {
-                                     return q.get() == queue;
-                                   }),
-                    _queues.end());
-      queue = nullptr;
-    }
-    void set_saved_queue(sycl::queue *q)
-    {
-      std::lock_guard<mutex_type> lock(m_mutex);
-      _saved_queue = q;
-    }
-    sycl::queue *get_saved_queue() const
-    {
-      std::lock_guard<mutex_type> lock(m_mutex);
-      return _saved_queue;
-    }
-    sycl::context get_context() const { return _ctx; }
-
-  private:
-    void clear_queues()
-    {
-      _queues.clear();
-      _q_in_order = _q_out_of_order = _saved_queue = nullptr;
-    }
-
-    void init_queues()
-    {
-      _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
-      _q_out_of_order = create_queue_impl(true);
-      _saved_queue = &default_queue();
-    }
-
-    /// Caller should acquire resource \p m_mutex before calling this function.
-    template <class... Properties>
-    sycl::queue *create_queue_impl(bool enable_exception_handler,
-                                   Properties... properties)
-    {
-      sycl::async_handler eh = {};
-      if (enable_exception_handler)
-      {
-        eh = exception_handler;
-      }
-      _queues.push_back(std::make_shared<sycl::queue>(
-          _ctx, *this, eh,
-          sycl::property_list(
-#ifdef DPCT_PROFILING_ENABLED
-              sycl::property::queue::enable_profiling(),
-#endif
-              properties...)));
-
-      return _queues.back().get();
-    }
-
-    void get_version(int &major, int &minor) const
-    {
-      detail::get_version(*this, major, minor);
-    }
-    sycl::queue *_q_in_order, *_q_out_of_order;
-    sycl::queue *_saved_queue;
-    sycl::context _ctx;
-    std::vector<std::shared_ptr<sycl::queue>> _queues;
-    mutable mutex_type m_mutex;
-  };
-
-  /// device manager
-  class dev_mgr
-  {
-  public:
-    device_ext &current_device()
-    {
-      unsigned int dev_id = current_device_id();
-      check_id(dev_id);
-      return *_devs[dev_id];
-    }
-    device_ext &cpu_device() const
-    {
-      std::lock_guard<std::recursive_mutex> lock(m_mutex);
-      if (_cpu_device == -1)
-      {
-        throw std::runtime_error("no valid cpu device");
-      }
-      else
-      {
-        return *_devs[_cpu_device];
-      }
-    }
-    device_ext &get_device(unsigned int id) const
-    {
-      std::lock_guard<std::recursive_mutex> lock(m_mutex);
-      check_id(id);
-      return *_devs[id];
-    }
-    unsigned int current_device_id() const
-    {
-      std::lock_guard<std::recursive_mutex> lock(m_mutex);
-      auto it = _thread2dev_map.find(get_tid());
-      if (it != _thread2dev_map.end())
-        return it->second;
-      return DEFAULT_DEVICE_ID;
-    }
-
-    /// Select device with a device ID.
-    /// \param [in] id The id of the device which can
-    /// be obtained through get_device_id(const sycl::device).
-    void select_device(unsigned int id)
-    {
-      std::lock_guard<std::recursive_mutex> lock(m_mutex);
-      check_id(id);
-      _thread2dev_map[get_tid()] = id;
-    }
-    unsigned int device_count() { return _devs.size(); }
-
-    unsigned int get_device_id(const sycl::device &dev)
-    {
-      unsigned int id = 0;
-      for (auto dev_item : _devs)
-      {
-        if (*dev_item == dev)
-        {
-          break;
-        }
-        id++;
-      }
-      return id;
-    }
-
-    template <class DeviceSelector>
-    std::enable_if_t<
-        std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
-    select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
-    {
-      sycl::device selected_device = sycl::device(selector);
-      unsigned int selected_device_id = get_device_id(selected_device);
-      select_device(selected_device_id);
-    }
-
-    /// Returns the instance of device manager singleton.
-    static dev_mgr &instance()
-    {
-      static dev_mgr d_m;
-      return d_m;
-    }
-    dev_mgr(const dev_mgr &) = delete;
-    dev_mgr &operator=(const dev_mgr &) = delete;
-    dev_mgr(dev_mgr &&) = delete;
-    dev_mgr &operator=(dev_mgr &&) = delete;
-
-  private:
-    mutable std::recursive_mutex m_mutex;
-    dev_mgr()
-    {
-      sycl::device default_device =
-          sycl::device(sycl::default_selector_v);
-      _devs.push_back(std::make_shared<device_ext>(default_device));
-
-      std::vector<sycl::device> sycl_all_devs =
-          sycl::device::get_devices(sycl::info::device_type::all);
-      // Collect other devices except for the default device.
-      if (default_device.is_cpu())
-        _cpu_device = 0;
-      for (auto &dev : sycl_all_devs)
-      {
-        if (dev == default_device)
-        {
-          continue;
-        }
-        _devs.push_back(std::make_shared<device_ext>(dev));
-        if (_cpu_device == -1 && dev.is_cpu())
-        {
-          _cpu_device = _devs.size() - 1;
-        }
-      }
-    }
-    void check_id(unsigned int id) const
-    {
-      if (id >= _devs.size())
-      {
-        throw std::runtime_error("invalid device id");
-      }
-    }
-    std::vector<std::shared_ptr<device_ext>> _devs;
-    /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
-    /// thread id in _thread2dev_map, which means default device should be used
-    /// for the current thread.
-    const unsigned int DEFAULT_DEVICE_ID = 0;
-    /// thread-id to device-id map.
-    std::map<unsigned int, unsigned int> _thread2dev_map;
-    int _cpu_device = -1;
-  };
-
-  static inline sycl::queue &get_default_queue()
-  {
-    return dev_mgr::instance().current_device().default_queue();
-  }
-
-  namespace detail
-  {
-    enum class pointer_access_attribute
-    {
-      host_only = 0,
-      device_only,
-      host_device,
-      end
-    };
-
-    static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
-                                                          const void *ptr)
-    {
-#ifdef DPCT_USM_LEVEL_NONE
-      return mem_mgr::instance().is_device_ptr(ptr)
-                 ? pointer_access_attribute::device_only
-                 : pointer_access_attribute::host_only;
-#else
-      switch (sycl::get_pointer_type(ptr, q.get_context()))
-      {
-      case sycl::usm::alloc::unknown:
-        return pointer_access_attribute::host_only;
-      case sycl::usm::alloc::device:
-        return pointer_access_attribute::device_only;
-      case sycl::usm::alloc::shared:
-      case sycl::usm::alloc::host:
-        return pointer_access_attribute::host_device;
-      }
-#endif
-    }
-
-    template <typename ArgT>
-    inline constexpr std::uint64_t get_type_combination_id(ArgT Val)
-    {
-      static_assert((unsigned char)library_data_t::library_data_t_size <=
-                        std::numeric_limits<unsigned char>::max() &&
-                    "library_data_t size exceeds limit.");
-      static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
-      return (std::uint64_t)Val;
-    }
-
-    template <typename FirstT, typename... RestT>
-    inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
-                                                           RestT... RestVal)
-    {
-      static_assert((std::uint8_t)library_data_t::library_data_t_size <=
-                        std::numeric_limits<unsigned char>::max() &&
-                    "library_data_t size exceeds limit.");
-      static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
-      static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
-      return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
-    }
-
-    class mem_mgr
-    {
-      mem_mgr()
-      {
-        // Reserved address space, no real memory allocation happens here.
-#if defined(__linux__)
-        mapped_address_space =
-            (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
-                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-#elif defined(_WIN64)
-        mapped_address_space = (byte_t *)VirtualAlloc(
-            NULL,               // NULL specified as the base address parameter
-            mapped_region_size, // Size of allocation
-            MEM_RESERVE,        // Allocate reserved pages
-            PAGE_NOACCESS);     // Protection = no access
-#else
-#error "Only support Windows and Linux."
-#endif
-        next_free = mapped_address_space;
-      };
-
-    public:
-      using buffer_id_t = int;
-
-      struct allocation
-      {
-        buffer_t buffer;
-        byte_t *alloc_ptr;
-        size_t size;
-      };
-
-      ~mem_mgr()
-      {
-#if defined(__linux__)
-        munmap(mapped_address_space, mapped_region_size);
-#elif defined(_WIN64)
-        VirtualFree(mapped_address_space, 0, MEM_RELEASE);
-#else
-#error "Only support Windows and Linux."
-#endif
-      };
-
-      mem_mgr(const mem_mgr &) = delete;
-      mem_mgr &operator=(const mem_mgr &) = delete;
-      mem_mgr(mem_mgr &&) = delete;
-      mem_mgr &operator=(mem_mgr &&) = delete;
-
-      /// Allocate
-      void *mem_alloc(size_t size)
-      {
-        if (!size)
-          return nullptr;
-        std::lock_guard<std::mutex> lock(m_mutex);
-        if (next_free + size > mapped_address_space + mapped_region_size)
-        {
-          throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool");
-        }
-        // Allocation
-        sycl::range<1> r(size);
-        buffer_t buf(r);
-        allocation A{buf, next_free, size};
-        // Map allocation to device pointer
-        void *result = next_free;
-        m_map.emplace(next_free + size, A);
-        // Update pointer to the next free space.
-        next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
-
-        return result;
-      }
-
-      /// Deallocate
-      void mem_free(const void *ptr)
-      {
-        if (!ptr)
-          return;
-        std::lock_guard<std::mutex> lock(m_mutex);
-        auto it = get_map_iterator(ptr);
-        m_map.erase(it);
-      }
-
-      /// map: device pointer -> allocation(buffer, alloc_ptr, size)
-      allocation translate_ptr(const void *ptr)
-      {
-        std::lock_guard<std::mutex> lock(m_mutex);
-        auto it = get_map_iterator(ptr);
-        return it->second;
-      }
-
-      /// Check if the pointer represents device pointer or not.
-      bool is_device_ptr(const void *ptr) const
-      {
-        std::lock_guard<std::mutex> lock(m_mutex);
-        return (mapped_address_space <= ptr) &&
-               (ptr < mapped_address_space + mapped_region_size);
-      }
-
-      /// Returns the instance of memory manager singleton.
-      static mem_mgr &instance()
-      {
-        static mem_mgr m;
-        return m;
-      }
-
-    private:
-      std::map<byte_t *, allocation> m_map;
-      mutable std::mutex m_mutex;
-      byte_t *mapped_address_space;
-      byte_t *next_free;
-      const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
-      const size_t alignment = 256;
-      /// This padding may be defined to some positive value to debug
-      /// out of bound accesses.
-      const size_t extra_padding = 0;
-
-      std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
-      {
-        auto it = m_map.upper_bound((byte_t *)ptr);
-        if (it == m_map.end())
-        {
-          // Not a virtual pointer.
-          throw std::runtime_error("can not get buffer from non-virtual pointer");
-        }
-        const allocation &alloc = it->second;
-        if (ptr < alloc.alloc_ptr)
-        {
-          // Out of bound.
-          // This may happen if there's a gap between allocations due to alignment
-          // or extra padding and pointer points to this gap.
-          throw std::runtime_error("invalid virtual pointer");
-        }
-        return it;
-      }
-    };
-
-    template <class T, memory_region Memory, size_t Dimension>
-    class accessor;
-    template <memory_region Memory, class T = byte_t>
-    class memory_traits
-    {
-    public:
-      static constexpr sycl::access::target target =
-          sycl::access::target::device;
-      static constexpr sycl::access_mode mode =
-          (Memory == constant) ? sycl::access_mode::read
-                               : sycl::access_mode::read_write;
-      static constexpr size_t type_size = sizeof(T);
-      using element_t =
-          typename std::conditional<Memory == constant, const T, T>::type;
-      using value_t = typename std::remove_cv<T>::type;
-      template <size_t Dimension = 1>
-      using accessor_t = typename std::conditional<
-          Memory == local, sycl::local_accessor<value_t, Dimension>,
-          sycl::accessor<T, Dimension, mode, target>>::type;
-      using pointer_t = T *;
-    };
-
-    static inline void *dpct_malloc(size_t size, sycl::queue &q)
-    {
-#ifdef DPCT_USM_LEVEL_NONE
-      return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
-#else
-      return sycl::malloc_device(size, q.get_device(), q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
-    }
-
-#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
-    static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
-                                    sycl::queue &q)
-    {
-      pitch = PITCH_DEFAULT_ALIGN(x);
-      return dpct_malloc(pitch * y * z, q);
-    }
-
-    /**
-     * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
-     * @tparam valueT The type of the element to be set.
-     * @param [in] q The queue in which the operation is done.
-     * @param [in] dev_ptr Pointer to the virtual device memory address.
-     * @param [in] value The value to be set.
-     * @param [in] size Number of elements to be set to the value.
-     * @return An event representing the memset operation.
-     */
-    template <typename valueT>
-    static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
-                                          valueT value, size_t size)
-    {
-#ifdef DPCT_USM_LEVEL_NONE
-      auto &mm = mem_mgr::instance();
-      assert(mm.is_device_ptr(dev_ptr));
-      auto alloc = mm.translate_ptr(dev_ptr);
-      size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
-
-      return q.submit([&](sycl::handler &cgh)
-                      {
-    auto r = sycl::range<1>(size);
-    auto o = sycl::id<1>(offset);
-    auto new_buffer = alloc.buffer.reinterpret<valueT>(
-        sycl::range<1>(alloc.size / sizeof(valueT)));
-    sycl::accessor<valueT, 1, sycl::access_mode::write,
-                   sycl::access::target::device>
-        acc(new_buffer, cgh, r, o);
-    cgh.fill(acc, value); });
-#else
-      return q.fill(dev_ptr, value, size);
-#endif // DPCT_USM_LEVEL_NONE
-    }
-
-    /**
-     * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
-     * @tparam valueT The type of the element to be set.
-     * @param [in] q The queue in which the operation is done.
-     * @param [in] data Pointer to the pitched device memory region.
-     * @param [in] value The value to be set.
-     * @param [in] size 3D memory region by number of elements.
-     * @return An event list representing the memset operations.
-     */
-    template <typename valueT>
-    static inline std::vector<sycl::event>
-    dpct_memset(sycl::queue &q, pitched_data data, valueT value,
-                sycl::range<3> size)
-    {
-      std::vector<sycl::event> event_list;
-      size_t slice = data.get_pitch() * data.get_y();
-      unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
-      for (size_t z = 0; z < size.get(2); ++z)
-      {
-        unsigned char *data_ptr = data_surface;
-        for (size_t y = 0; y < size.get(1); ++y)
-        {
-          event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
-          data_ptr += data.get_pitch();
-        }
-        data_surface += slice;
-      }
-      return event_list;
-    }
-
-    /**
-     * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
-     * @tparam valueT The type of the element to be set.
-     * @param [in] q The queue in which the operation is done.
-     * @param [in] ptr Pointer to the virtual device memory.
-     * @param [in] pitch The pitch size by number of elements, including padding.
-     * @param [in] val The value to be set.
-     * @param [in] x The width of memory region by number of elements.
-     * @param [in] y The height of memory region by number of elements.
-     * @return An event list representing the memset operations.
-     */
-    template <typename valueT>
-    static inline std::vector<sycl::event>
-    dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
-                size_t y)
-    {
-      return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
-                         sycl::range<3>(x, y, 1));
-    }
-
-    static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
-                                                    const void *from_ptr,
-                                                    memcpy_direction dir)
-    {
-      switch (dir)
-      {
-      case memcpy_direction::host_to_host:
-      case memcpy_direction::host_to_device:
-      case memcpy_direction::device_to_host:
-      case memcpy_direction::device_to_device:
-        return dir;
-      case memcpy_direction::automatic:
-      {
-        // table[to_attribute][from_attribute]
-        static const memcpy_direction
-            direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
-                           [static_cast<unsigned>(pointer_access_attribute::end)] =
-                               {{memcpy_direction::host_to_host,
-                                 memcpy_direction::device_to_host,
-                                 memcpy_direction::host_to_host},
-                                {memcpy_direction::host_to_device,
-                                 memcpy_direction::device_to_device,
-                                 memcpy_direction::device_to_device},
-                                {memcpy_direction::host_to_host,
-                                 memcpy_direction::device_to_device,
-                                 memcpy_direction::device_to_device}};
-        return direction_table[static_cast<unsigned>(get_pointer_attribute(
-            q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
-      }
-      default:
-        throw std::runtime_error("dpct_memcpy: invalid direction value");
-      }
-    }
-
-    static sycl::event
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
-                memcpy_direction direction,
-                const std::vector<sycl::event> &dep_events = {})
-    {
-      if (!size)
-        return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-      auto &mm = mem_mgr::instance();
-      auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-      switch (real_direction)
-      {
-      case host_to_host:
-        return q.submit([&](sycl::handler &cgh)
-                        {
-      cgh.depends_on(dep_events);
-      cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-      case host_to_device:
-      {
-        auto alloc = mm.translate_ptr(to_ptr);
-        size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-        return q.submit([&](sycl::handler &cgh)
-                        {
-      cgh.depends_on(dep_events);
-      auto r = sycl::range<1>(size);
-      auto o = sycl::id<1>(offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                          sycl::access::target::device>
-          acc(alloc.buffer, cgh, r, o);
-      cgh.copy(from_ptr, acc); });
-      }
-      case device_to_host:
-      {
-        auto alloc = mm.translate_ptr(from_ptr);
-        size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-        return q.submit([&](sycl::handler &cgh)
-                        {
-      cgh.depends_on(dep_events);
-      auto r = sycl::range<1>(size);
-      auto o = sycl::id<1>(offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                          sycl::access::target::device>
-          acc(alloc.buffer, cgh, r, o);
-      cgh.copy(acc, to_ptr); });
-      }
-      case device_to_device:
-      {
-        auto to_alloc = mm.translate_ptr(to_ptr);
-        auto from_alloc = mm.translate_ptr(from_ptr);
-        size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-        size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-        return q.submit([&](sycl::handler &cgh)
-                        {
-      cgh.depends_on(dep_events);
-      auto r = sycl::range<1>(size);
-      auto to_o = sycl::id<1>(to_offset);
-      auto from_o = sycl::id<1>(from_offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                          sycl::access::target::device>
-          to_acc(to_alloc.buffer, cgh, r, to_o);
-      sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                          sycl::access::target::device>
-          from_acc(from_alloc.buffer, cgh, r, from_o);
-      cgh.copy(from_acc, to_acc); });
-      }
-      default:
-        throw std::runtime_error("dpct_memcpy: invalid direction value");
-      }
-#else
-      return q.memcpy(to_ptr, from_ptr, size, dep_events);
-#endif // DPCT_USM_LEVEL_NONE
-    }
-
-    // Get actual copy range and make sure it will not exceed range.
-    static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-                                        size_t pitch)
-    {
-      return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
-    }
-
-    static inline size_t get_offset(sycl::id<3> id, size_t slice,
-                                    size_t pitch)
-    {
-      return slice * id.get(2) + pitch * id.get(1) + id.get(0);
-    }
-
-    /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
-    /// and \p from_range to another specified by \p to_ptr and \p to_range.
-    static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                sycl::range<3> to_range, sycl::range<3> from_range,
-                sycl::id<3> to_id, sycl::id<3> from_id,
-                sycl::range<3> size, memcpy_direction direction,
-                const std::vector<sycl::event> &dep_events = {})
-    {
-      // RAII for host pointer
-      class host_buffer
-      {
-        void *_buf;
-        size_t _size;
-        sycl::queue &_q;
-        const std::vector<sycl::event> &_deps; // free operation depends
-
-      public:
-        host_buffer(size_t size, sycl::queue &q,
-                    const std::vector<sycl::event> &deps)
-            : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-        void *get_ptr() const { return _buf; }
-        size_t get_size() const { return _size; }
-        ~host_buffer()
-        {
-          if (_buf)
-          {
-            _q.submit([&](sycl::handler &cgh)
-                      {
-          cgh.depends_on(_deps);
-          cgh.host_task([buf = _buf] { std::free(buf); }); });
-          }
-        }
-      };
-      std::vector<sycl::event> event_list;
-
-      size_t to_slice = to_range.get(1) * to_range.get(0),
-             from_slice = from_range.get(1) * from_range.get(0);
-      unsigned char *to_surface =
-          (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-      const unsigned char *from_surface =
-          (const unsigned char *)from_ptr +
-          get_offset(from_id, from_slice, from_range.get(0));
-
-      if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
-      {
-        return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                            direction, dep_events)};
-      }
-      direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-      size_t size_slice = size.get(1) * size.get(0);
-      switch (direction)
-      {
-      case host_to_host:
-        for (size_t z = 0; z < size.get(2); ++z)
-        {
-          unsigned char *to_ptr = to_surface;
-          const unsigned char *from_ptr = from_surface;
-          if (to_range.get(0) == from_range.get(0) &&
-              to_range.get(0) == size.get(0))
-          {
-            event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
-                                             direction, dep_events));
-          }
-          else
-          {
-            for (size_t y = 0; y < size.get(1); ++y)
-            {
-              event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
-                                               direction, dep_events));
-              to_ptr += to_range.get(0);
-              from_ptr += from_range.get(0);
-            }
-          }
-          to_surface += to_slice;
-          from_surface += from_slice;
-        }
-        break;
-      case host_to_device:
-      {
-        host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                        event_list);
-        std::vector<sycl::event> host_events;
-        if (to_slice == size_slice)
-        {
-          // Copy host data to a temp host buffer with the shape of target.
-          host_events =
-              dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                          sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                          host_to_host, dep_events);
-        }
-        else
-        {
-          // Copy host data to a temp host buffer with the shape of target.
-          host_events = dpct_memcpy(
-              q, buf.get_ptr(), from_surface, to_range, from_range,
-              sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
-              // If has padding data, not sure whether it is useless. So fill temp
-              // buffer with it.
-              std::vector<sycl::event>{
-                  dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
-                              device_to_host, dep_events)});
-        }
-        // Copy from temp host buffer to device with only one submit.
-        event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
-                                         buf.get_size(), host_to_device,
-                                         host_events));
-        break;
-      }
-      case device_to_host:
-      {
-        host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                        event_list);
-        // Copy from host temp buffer to host target with reshaping.
-        event_list = dpct_memcpy(
-            q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
-            sycl::id<3>(0, 0, 0), size, host_to_host,
-            // Copy from device to temp host buffer with only one submit.
-            std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
-                                                 buf.get_size(),
-                                                 device_to_host, dep_events)});
-        break;
-      }
-      case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-      {
-        auto &mm = mem_mgr::instance();
-        auto to_alloc = mm.translate_ptr(to_surface);
-        auto from_alloc = mm.translate_ptr(from_surface);
-        size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-        size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-        event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                      {
-      cgh.depends_on(dep_events);
-      auto to_o = sycl::id<1>(to_offset);
-      auto from_o = sycl::id<1>(from_offset);
-      sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                         sycl::access::target::device>
-          to_acc(to_alloc.buffer, cgh,
-                 get_copy_range(size, to_slice, to_range.get(0)), to_o);
-      sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                         sycl::access::target::device>
-          from_acc(from_alloc.buffer, cgh,
-                   get_copy_range(size, from_slice, from_range.get(0)), from_o);
-      cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-          size,
-          [=](sycl::id<3> id) {
-            to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                from_acc[get_offset(id, from_slice, from_range.get(0))];
-          }); }));
-      }
-#else
-        event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                      {
-      cgh.depends_on(dep_events);
-      cgh.parallel_for<class dpct_memcpy_3d_detail>(
-          size,
-          [=](sycl::id<3> id) {
-            to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                from_surface[get_offset(id, from_slice, from_range.get(0))];
-          }); }));
-#endif
-      break;
-      default:
-        throw std::runtime_error("dpct_memcpy: invalid direction value");
-      }
-      return event_list;
-    }
-
-    /// memcpy 2D/3D matrix specified by pitched_data.
-    static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
-                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
-                memcpy_direction direction = automatic)
-    {
-      return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-                         sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-                         sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
-                         size, direction);
-    }
-
-    /// memcpy 2D matrix with pitch.
-    static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
-                memcpy_direction direction = automatic)
-    {
-      return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-                         sycl::range<3>(from_pitch, y, 1),
-                         sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
-                         sycl::range<3>(x, y, 1), direction);
-    }
-
-    namespace deprecated
-    {
-
-      template <typename T, sycl::usm::alloc AllocKind>
-      class usm_allocator
-      {
-      private:
-        using Alloc = sycl::usm_allocator<T, AllocKind>;
-        Alloc _impl;
-
-      public:
-        using value_type = typename std::allocator_traits<Alloc>::value_type;
-        using pointer = typename std::allocator_traits<Alloc>::pointer;
-        using const_pointer = typename std::allocator_traits<Alloc>::const_pointer;
-        using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
-        using const_void_pointer =
-            typename std::allocator_traits<Alloc>::const_void_pointer;
-        using reference = typename std::allocator_traits<Alloc>::value_type &;
-        using const_reference =
-            const typename std::allocator_traits<Alloc>::value_type &;
-        using difference_type =
-            typename std::allocator_traits<Alloc>::difference_type;
-        using size_type = typename std::allocator_traits<Alloc>::size_type;
-        using propagate_on_container_copy_assignment = typename std::allocator_traits<
-            Alloc>::propagate_on_container_copy_assignment;
-        using propagate_on_container_move_assignment = typename std::allocator_traits<
-            Alloc>::propagate_on_container_move_assignment;
-        using propagate_on_container_swap =
-            typename std::allocator_traits<Alloc>::propagate_on_container_swap;
-        using is_always_equal =
-            typename std::allocator_traits<Alloc>::is_always_equal;
-
-        template <typename U>
-        struct rebind
-        {
-          typedef usm_allocator<U, AllocKind> other;
-        };
-
-        usm_allocator() : _impl(dpct::get_default_queue()) {}
-        ~usm_allocator() {}
-        usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
-        usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
-        pointer address(reference r) { return &r; }
-        const_pointer address(const_reference r) { return &r; }
-        pointer allocate(size_type cnt, const_void_pointer hint = nullptr)
-        {
-          return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint);
-        }
-        void deallocate(pointer p, size_type cnt)
-        {
-          std::allocator_traits<Alloc>::deallocate(_impl, p, cnt);
-        }
-        size_type max_size() const
-        {
-          return std::allocator_traits<Alloc>::max_size(_impl);
-        }
-        bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
-        bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
-      };
-
-    } // namespace deprecated
-
-    inline void dpct_free(void *ptr,
-                          const sycl::queue &q)
-    {
-      if (ptr)
-      {
-#ifdef DPCT_USM_LEVEL_NONE
-        detail::mem_mgr::instance().mem_free(ptr);
-#else
-        sycl::free(ptr, q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
-      }
-    }
-
-    template <typename T>
-    inline auto get_memory(const void *x)
-    {
-      T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
-#ifdef DPCT_USM_LEVEL_NONE
-      return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
-#else
-      return new_x;
-#endif
-    }
-
-    template <typename T>
-    inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q)
-    {
-      using Ty = typename DataType<T>::T2;
-      Ty s_h;
-      if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
-        detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
-            .wait();
-      else
-        s_h = *reinterpret_cast<const Ty *>(s);
-      return s_h;
-    }
-
-  } // namespace detail
-
-  template <typename T>
-  inline auto get_value(const T *s, sycl::queue &q)
-  {
-    return detail::get_value(s, q);
-  }
-
-  namespace detail
-  {
-    template <class Ta, class Tb, class Tc, class Ts>
-    inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                          oneapi::mkl::transpose b_trans, int m, int n, int k,
-                          const void *alpha, const void *a, int lda, const void *b,
-                          int ldb, const void *beta, void *c, int ldc)
-    {
-#ifndef __INTEL_MKL__
-      throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                               "Project does not support this API.");
-#else
-      Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-      Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-      auto data_a = get_memory<const Ta>(a);
-      auto data_b = get_memory<const Tb>(b);
-      auto data_c = get_memory<Tc>(c);
-      oneapi::mkl::blas::column_major::gemm(
-          q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
-          data_b, ldb, beta_value, data_c, ldc);
-#endif
-    }
-
-    template <typename VecT, class BinaryOperation, class = void>
-    class vectorized_binary
-    {
-    public:
-      inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
-      {
-        VecT v4;
-        for (size_t i = 0; i < v4.size(); ++i)
-        {
-          v4[i] = binary_op(a[i], b[i]);
-        }
-        return v4;
-      }
-    };
-
-    template <typename VecT, class BinaryOperation>
-    class vectorized_binary<
-        VecT, BinaryOperation,
-        std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>>
-    {
-    public:
-      inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
-      {
-        return binary_op(a, b).template as<VecT>();
-      }
-    };
-
-    template <class Ta, class Tb, class Tc, class Ts>
-    inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                                oneapi::mkl::transpose b_trans, int m, int n, int k,
-                                const void *alpha, const void **a, int lda,
-                                const void **b, int ldb, const void *beta, void **c,
-                                int ldc, int batch_size)
-    {
-      struct matrix_info_t
-      {
-        oneapi::mkl::transpose transpose_info[2];
-        Ts value_info[2];
-        std::int64_t size_info[3];
-        std::int64_t ld_info[3];
-        std::int64_t groupsize_info;
-      };
-
-      Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-      Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-
-      matrix_info_t *matrix_info =
-          (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
-      matrix_info->transpose_info[0] = a_trans;
-      matrix_info->transpose_info[1] = b_trans;
-      matrix_info->value_info[0] = alpha_value;
-      matrix_info->value_info[1] = beta_value;
-      matrix_info->size_info[0] = m;
-      matrix_info->size_info[1] = n;
-      matrix_info->size_info[2] = k;
-      matrix_info->ld_info[0] = lda;
-      matrix_info->ld_info[1] = ldb;
-      matrix_info->ld_info[2] = ldc;
-      matrix_info->groupsize_info = batch_size;
-
-      sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
-          q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
-          matrix_info->size_info, matrix_info->size_info + 1,
-          matrix_info->size_info + 2, matrix_info->value_info,
-          reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
-          reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
-          matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
-          matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
-
-      q.submit([&](sycl::handler &cgh)
-               {
-    cgh.depends_on(e);
-    cgh.host_task([=] { std::free(matrix_info); }); });
-    }
-
-    template <class Ta, class Tb, class Tc, class Ts>
-    inline void
-    gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                    oneapi::mkl::transpose b_trans, int m, int n,
-                    int k, const void *alpha, const void *a, int lda,
-                    long long int stride_a, const void *b, int ldb,
-                    long long int stride_b, const void *beta, void *c,
-                    int ldc, long long int stride_c, int batch_size)
-    {
-      Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-      Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-      auto data_a = get_memory<const Ta>(a);
-      auto data_b = get_memory<const Tb>(b);
-      auto data_c = get_memory<Tc>(c);
-      oneapi::mkl::blas::column_major::gemm_batch(
-          q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
-          stride_a, data_b, ldb, stride_b, beta_value,
-          data_c, ldc, stride_c, batch_size);
-    }
-
-  } // namespace detail
-
-  template <typename VecT, class BinaryOperation>
-  inline unsigned vectorized_binary(unsigned a, unsigned b,
-                                    const BinaryOperation binary_op)
-  {
-    sycl::vec<unsigned, 1> v0{a}, v1{b};
-    auto v2 = v0.as<VecT>();
-    auto v3 = v1.as<VecT>();
-    auto v4 =
-        detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op);
-    v0 = v4.template as<sycl::vec<unsigned, 1>>();
-    return v0;
-  }
-
-  static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
-                                memcpy_direction direction = automatic,
-                                sycl::queue &q = dpct::get_default_queue())
-  {
-    detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
-  }
-
-  static inline unsigned int select_device(unsigned int id)
-  {
-    dev_mgr::instance().select_device(id);
-    return id;
-  }
-
-  template <typename T>
-  T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
-                             int logical_sub_group_size = 32)
-  {
-    unsigned int id = g.get_local_linear_id();
-    unsigned int start_index =
-        id / logical_sub_group_size * logical_sub_group_size;
-    unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
-    return sycl::select_from_group(g, x,
-                                   target_offset < logical_sub_group_size
-                                       ? start_index + target_offset
-                                       : id);
-  }
-
-  template <typename T>
-  sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
-  {
-    return sycl::vec<T, 1>(val)
-        .template as<sycl::vec<
-            std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
-        .template convert<T>();
-  }
-
-  template <typename T1, typename T2>
-  using dot_product_acc_t =
-      std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
-                         uint32_t, int32_t>;
-
-  template <typename T1, typename T2, typename T3>
-  inline auto dp4a(T1 a, T2 b, T3 c)
-  {
-    dot_product_acc_t<T1, T2> res = c;
-    auto va = extract_and_sign_or_zero_extend4(a);
-    auto vb = extract_and_sign_or_zero_extend4(b);
-    res += va[0] * vb[0];
-    res += va[1] * vb[1];
-    res += va[2] * vb[2];
-    res += va[3] * vb[3];
-    return res;
-  }
-
-  struct sub_sat
-  {
-    template <typename T>
-    auto operator()(const T x, const T y) const
-    {
-      return sycl::sub_sat(x, y);
-    }
-  };
-
-  template <typename S, typename T>
-  inline T vectorized_min(T a, T b)
-  {
-    sycl::vec<T, 1> v0{a}, v1{b};
-    auto v2 = v0.template as<S>();
-    auto v3 = v1.template as<S>();
-    auto v4 = sycl::min(v2, v3);
-    v0 = v4.template as<sycl::vec<T, 1>>();
-    return v0;
-  }
-
-  inline float pow(const float a, const int b) { return sycl::pown(a, b); }
-  inline double pow(const double a, const int b) { return sycl::pown(a, b); }
-  inline float pow(const float a, const float b) { return sycl::pow(a, b); }
-  inline double pow(const double a, const double b) { return sycl::pow(a, b); }
-  template <typename T, typename U>
-  inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
-  pow(const T a, const U b)
-  {
-    return sycl::pow(a, static_cast<T>(b));
-  }
-  template <typename T, typename U>
-  inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
-  pow(const T a, const U b)
-  {
-    return sycl::pow(static_cast<double>(a), static_cast<double>(b));
-  }
-
-  inline double min(const double a, const float b)
-  {
-    return sycl::fmin(a, static_cast<double>(b));
-  }
-  inline double min(const float a, const double b)
-  {
-    return sycl::fmin(static_cast<double>(a), b);
-  }
-  inline float min(const float a, const float b) { return sycl::fmin(a, b); }
-  inline double min(const double a, const double b) { return sycl::fmin(a, b); }
-  inline std::uint32_t min(const std::uint32_t a, const std::int32_t b)
-  {
-    return sycl::min(a, static_cast<std::uint32_t>(b));
-  }
-  inline std::uint32_t min(const std::int32_t a, const std::uint32_t b)
-  {
-    return sycl::min(static_cast<std::uint32_t>(a), b);
-  }
-  inline std::int32_t min(const std::int32_t a, const std::int32_t b)
-  {
-    return sycl::min(a, b);
-  }
-  inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b)
-  {
-    return sycl::min(a, b);
-  }
-  inline std::uint64_t min(const std::uint64_t a, const std::int64_t b)
-  {
-    return sycl::min(a, static_cast<std::uint64_t>(b));
-  }
-  inline std::uint64_t min(const std::int64_t a, const std::uint64_t b)
-  {
-    return sycl::min(static_cast<std::uint64_t>(a), b);
-  }
-  inline std::int64_t min(const std::int64_t a, const std::int64_t b)
-  {
-    return sycl::min(a, b);
-  }
-  inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b)
-  {
-    return sycl::min(a, b);
-  }
-  inline std::uint64_t min(const std::uint64_t a, const std::int32_t b)
-  {
-    return sycl::min(a, static_cast<std::uint64_t>(b));
-  }
-  inline std::uint64_t min(const std::int32_t a, const std::uint64_t b)
-  {
-    return sycl::min(static_cast<std::uint64_t>(a), b);
-  }
-  inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b)
-  {
-    return sycl::min(a, static_cast<std::uint64_t>(b));
-  }
-  inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b)
-  {
-    return sycl::min(static_cast<std::uint64_t>(a), b);
-  }
-  // max function overloads.
-  // For floating-point types, `float` or `double` arguments are acceptable.
-  // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
-  // `std::int64_t` type arguments are acceptable.
-  inline double max(const double a, const float b)
-  {
-    return sycl::fmax(a, static_cast<double>(b));
-  }
-  inline double max(const float a, const double b)
-  {
-    return sycl::fmax(static_cast<double>(a), b);
-  }
-  inline float max(const float a, const float b) { return sycl::fmax(a, b); }
-  inline double max(const double a, const double b) { return sycl::fmax(a, b); }
-  inline std::uint32_t max(const std::uint32_t a, const std::int32_t b)
-  {
-    return sycl::max(a, static_cast<std::uint32_t>(b));
-  }
-  inline std::uint32_t max(const std::int32_t a, const std::uint32_t b)
-  {
-    return sycl::max(static_cast<std::uint32_t>(a), b);
-  }
-  inline std::int32_t max(const std::int32_t a, const std::int32_t b)
-  {
-    return sycl::max(a, b);
-  }
-  inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b)
-  {
-    return sycl::max(a, b);
-  }
-  inline std::uint64_t max(const std::uint64_t a, const std::int64_t b)
-  {
-    return sycl::max(a, static_cast<std::uint64_t>(b));
-  }
-  inline std::uint64_t max(const std::int64_t a, const std::uint64_t b)
-  {
-    return sycl::max(static_cast<std::uint64_t>(a), b);
-  }
-  inline std::int64_t max(const std::int64_t a, const std::int64_t b)
-  {
-    return sycl::max(a, b);
-  }
-  inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b)
-  {
-    return sycl::max(a, b);
-  }
-  inline std::uint64_t max(const std::uint64_t a, const std::int32_t b)
-  {
-    return sycl::max(a, static_cast<std::uint64_t>(b));
-  }
-  inline std::uint64_t max(const std::int32_t a, const std::uint64_t b)
-  {
-    return sycl::max(static_cast<std::uint64_t>(a), b);
-  }
-  inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b)
-  {
-    return sycl::max(a, static_cast<std::uint64_t>(b));
-  }
-  inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b)
-  {
-    return sycl::max(static_cast<std::uint64_t>(a), b);
-  }
-
-  inline void
-  has_capability_or_fail(const sycl::device &dev,
-                         const std::initializer_list<sycl::aspect> &props)
-  {
-    for (const auto &it : props)
-    {
-      if (dev.has(it))
-        continue;
-      switch (it)
-      {
-      case sycl::aspect::fp64:
-        throw std::runtime_error("'double' is not supported in '" +
-                                 dev.get_info<sycl::info::device::name>() +
-                                 "' device");
-        break;
-      case sycl::aspect::fp16:
-        throw std::runtime_error("'half' is not supported in '" +
-                                 dev.get_info<sycl::info::device::name>() +
-                                 "' device");
-        break;
-      default:
-#define __SYCL_ASPECT(ASPECT, ID) \
-  case sycl::aspect::ASPECT:      \
-    return #ASPECT;
-#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
-#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
-        auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string
-        {
-          switch (AspectNum)
-          {
-#include <sycl/info/aspects.def>
-#include <sycl/info/aspects_deprecated.def>
-          default:
-            return "unknown aspect";
-          }
-        };
-#undef __SYCL_ASPECT_DEPRECATED_ALIAS
-#undef __SYCL_ASPECT_DEPRECATED
-#undef __SYCL_ASPECT
-        throw std::runtime_error(
-            "'" + getAspectNameStr(it) + "' is not supported in '" +
-            dev.get_info<sycl::info::device::name>() + "' device");
-      }
-      break;
-    }
-  }
-
-  static inline unsigned int get_current_device_id()
-  {
-    return dev_mgr::instance().current_device_id();
-  }
-
-  static inline device_ext &get_current_device()
-  {
-    return dev_mgr::instance().current_device();
-  }
-
-  static inline sycl::queue &get_in_order_queue()
-  {
-    return dev_mgr::instance().current_device().in_order_queue();
-  }
-
-  static sycl::event
-  dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
-              memcpy_direction direction,
-              const std::vector<sycl::event> &dep_events = {})
-  {
-    if (!size)
-      return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-    auto &mm = mem_mgr::instance();
-    auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-    switch (real_direction)
-    {
-    case host_to_host:
-      return q.submit([&](sycl::handler &cgh)
-                      {
-          cgh.depends_on(dep_events);
-          cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-    case host_to_device:
-    {
-      auto alloc = mm.translate_ptr(to_ptr);
-      size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-      return q.submit([&](sycl::handler &cgh)
-                      {
-          cgh.depends_on(dep_events);
-          auto r = sycl::range<1>(size);
-          auto o = sycl::id<1>(offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                              sycl::access::target::device>
-              acc(alloc.buffer, cgh, r, o);
-          cgh.copy(from_ptr, acc); });
-    }
-    case device_to_host:
-    {
-      auto alloc = mm.translate_ptr(from_ptr);
-      size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-      return q.submit([&](sycl::handler &cgh)
-                      {
-          cgh.depends_on(dep_events);
-          auto r = sycl::range<1>(size);
-          auto o = sycl::id<1>(offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                              sycl::access::target::device>
-              acc(alloc.buffer, cgh, r, o);
-          cgh.copy(acc, to_ptr); });
-    }
-    case device_to_device:
-    {
-      auto to_alloc = mm.translate_ptr(to_ptr);
-      auto from_alloc = mm.translate_ptr(from_ptr);
-      size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-      size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-      return q.submit([&](sycl::handler &cgh)
-                      {
-          cgh.depends_on(dep_events);
-          auto r = sycl::range<1>(size);
-          auto to_o = sycl::id<1>(to_offset);
-          auto from_o = sycl::id<1>(from_offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                              sycl::access::target::device>
-              to_acc(to_alloc.buffer, cgh, r, to_o);
-          sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                              sycl::access::target::device>
-              from_acc(from_alloc.buffer, cgh, r, from_o);
-          cgh.copy(from_acc, to_acc); });
-    }
-    default:
-      throw std::runtime_error("dpct_memcpy: invalid direction value");
-    }
-#else
-    return q.memcpy(to_ptr, from_ptr, size, dep_events);
-#endif // DPCT_USM_LEVEL_NONE
-  }
-
-  // Get actual copy range and make sure it will not exceed range.
-  static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-                                      size_t pitch)
-  {
-    return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
-  }
-
-  static inline size_t get_offset(sycl::id<3> id, size_t slice,
-                                  size_t pitch)
-  {
-    return slice * id.get(2) + pitch * id.get(1) + id.get(0);
-  }
-
-  /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
-  /// and \p from_range to another specified by \p to_ptr and \p to_range.
-  static inline std::vector<sycl::event>
-  dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-              sycl::range<3> to_range, sycl::range<3> from_range,
-              sycl::id<3> to_id, sycl::id<3> from_id,
-              sycl::range<3> size, memcpy_direction direction,
-              const std::vector<sycl::event> &dep_events = {})
-  {
-    // RAII for host pointer
-    class host_buffer
-    {
-      void *_buf;
-      size_t _size;
-      sycl::queue &_q;
-      const std::vector<sycl::event> &_deps; // free operation depends
-
-    public:
-      host_buffer(size_t size, sycl::queue &q,
-                  const std::vector<sycl::event> &deps)
-          : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-      void *get_ptr() const { return _buf; }
-      size_t get_size() const { return _size; }
-      ~host_buffer()
-      {
-        if (_buf)
-        {
-          _q.submit([&](sycl::handler &cgh)
-                    {
-              cgh.depends_on(_deps);
-              cgh.host_task([buf = _buf] { std::free(buf); }); });
-        }
-      }
-    };
-    std::vector<sycl::event> event_list;
-
-    size_t to_slice = to_range.get(1) * to_range.get(0),
-           from_slice = from_range.get(1) * from_range.get(0);
-    unsigned char *to_surface =
-        (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-    const unsigned char *from_surface =
-        (const unsigned char *)from_ptr +
-        get_offset(from_id, from_slice, from_range.get(0));
-
-    if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
-    {
-      return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                          direction, dep_events)};
-    }
-    direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-    size_t size_slice = size.get(1) * size.get(0);
-    switch (direction)
-    {
-    case host_to_host:
-      for (size_t z = 0; z < size.get(2); ++z)
-      {
-        unsigned char *to_ptr = to_surface;
-        const unsigned char *from_ptr = from_surface;
-        if (to_range.get(0) == from_range.get(0) &&
-            to_range.get(0) == size.get(0))
-        {
-          event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
-                                           direction, dep_events));
-        }
-        else
-        {
-          for (size_t y = 0; y < size.get(1); ++y)
-          {
-            event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
-                                             direction, dep_events));
-            to_ptr += to_range.get(0);
-            from_ptr += from_range.get(0);
-          }
-        }
-        to_surface += to_slice;
-        from_surface += from_slice;
-      }
-      break;
-    case host_to_device:
-    {
-      host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                      event_list);
-      std::vector<sycl::event> host_events;
-      if (to_slice == size_slice)
-      {
-        // Copy host data to a temp host buffer with the shape of target.
-        host_events =
-            dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                        sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                        host_to_host, dep_events);
-      }
-      else
-      {
-        // Copy host data to a temp host buffer with the shape of target.
-        host_events = dpct_memcpy(
-            q, buf.get_ptr(), from_surface, to_range, from_range,
-            sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
-            // If has padding data, not sure whether it is useless. So fill temp
-            // buffer with it.
-            std::vector<sycl::event>{
-                dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
-                            device_to_host, dep_events)});
-      }
-      // Copy from temp host buffer to device with only one submit.
-      event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
-                                       buf.get_size(), host_to_device,
-                                       host_events));
-      break;
-    }
-    case device_to_host:
-    {
-      host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                      event_list);
-      // Copy from host temp buffer to host target with reshaping.
-      event_list = dpct_memcpy(
-          q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
-          sycl::id<3>(0, 0, 0), size, host_to_host,
-          // Copy from device to temp host buffer with only one submit.
-          std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
-                                               buf.get_size(),
-                                               device_to_host, dep_events)});
-      break;
-    }
-    case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-    {
-      auto &mm = mem_mgr::instance();
-      auto to_alloc = mm.translate_ptr(to_surface);
-      auto from_alloc = mm.translate_ptr(from_surface);
-      size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-      size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-      event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                    {
-          cgh.depends_on(dep_events);
-          auto to_o = sycl::id<1>(to_offset);
-          auto from_o = sycl::id<1>(from_offset);
-          sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                             sycl::access::target::device>
-              to_acc(to_alloc.buffer, cgh,
-                     get_copy_range(size, to_slice, to_range.get(0)), to_o);
-          sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                             sycl::access::target::device>
-              from_acc(from_alloc.buffer, cgh,
-                       get_copy_range(size, from_slice, from_range.get(0)), from_o);
-          cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-              size,
-              [=](sycl::id<3> id) {
-                to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                    from_acc[get_offset(id, from_slice, from_range.get(0))];
-              }); }));
-    }
-#else
-      event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                    {
-          cgh.depends_on(dep_events);
-          cgh.parallel_for<class dpct_memcpy_3d_detail>(
-              size,
-              [=](sycl::id<3> id) {
-                to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                    from_surface[get_offset(id, from_slice, from_range.get(0))];
-              }); }));
-#endif
-    break;
-    default:
-      throw std::runtime_error("dpct_memcpy: invalid direction value");
-    }
-    return event_list;
-  }
-
-  /// memcpy 2D/3D matrix specified by pitched_data.
-  static inline std::vector<sycl::event>
-  dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
-              pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
-              memcpy_direction direction = automatic)
-  {
-    return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-                       sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-                       sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
-                       size, direction);
-  }
-
-  /// memcpy 2D matrix with pitch.
-  static inline std::vector<sycl::event>
-  dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-              size_t to_pitch, size_t from_pitch, size_t x, size_t y,
-              memcpy_direction direction = automatic)
-  {
-    return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-                       sycl::range<3>(from_pitch, y, 1),
-                       sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
-                       sycl::range<3>(x, y, 1), direction);
-  }
-
-  inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                   oneapi::mkl::transpose b_trans, int m, int n, int k,
-                   const void *alpha, const void *a, library_data_t a_type,
-                   int lda, const void *b, library_data_t b_type, int ldb,
-                   const void *beta, void *c, library_data_t c_type, int ldc,
-                   library_data_t scaling_type)
-  {
-    bool matched = false;
-    if (scaling_type == library_data_t::real_float &&
-        c_type == library_data_t::complex_float)
-    {
-      scaling_type = library_data_t::complex_float;
-    }
-    else if (scaling_type == library_data_t::real_double &&
-             c_type == library_data_t::complex_double)
-    {
-      scaling_type = library_data_t::complex_double;
-    }
-
-    std::uint64_t key =
-        detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-    switch (key)
-    {
-    case detail::get_type_combination_id(
-        library_data_t::real_float, library_data_t::real_float,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_impl<float, float, float, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_double, library_data_t::real_double,
-        library_data_t::real_double, library_data_t::real_double):
-    {
-      detail::gemm_impl<double, double, double, double>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::complex_float, library_data_t::complex_float,
-        library_data_t::complex_float, library_data_t::complex_float):
-    {
-      detail::gemm_impl<std::complex<float>, std::complex<float>,
-                        std::complex<float>, std::complex<float>>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::complex_double, library_data_t::complex_double,
-        library_data_t::complex_double, library_data_t::complex_double):
-    {
-      detail::gemm_impl<std::complex<double>, std::complex<double>,
-                        std::complex<double>, std::complex<double>>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_half):
-    {
-      detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                        sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
-                                    lda, b, ldb, beta, c, ldc);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                        float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
-                               ldb, beta, c, ldc);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_impl<sycl::half, sycl::half, float, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_float):
-    {
-      float alpha_value =
-          dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-      float beta_value =
-          dpct::get_value(reinterpret_cast<const float *>(beta), q);
-      sycl::half alpha_half(alpha_value);
-      sycl::half beta_half(beta_value);
-      detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                        sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
-                                    a, lda, b, ldb, &beta_half, c, ldc);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_bfloat16, library_data_t::real_float):
-    {
-      detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                        oneapi::mkl::bfloat16, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_int32, library_data_t::real_int32):
-    {
-      float alpha_float =
-          dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-      float beta_float =
-          dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-      detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
-          q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
-      break;
-    }
-    default:
-      throw std::runtime_error("the combination of data type is unsupported");
-    }
-  } // gemm()
-
-  /// Computes a batch of matrix-matrix product with general matrices.
-  /// \param [in] q The queue where the routine should be executed.
-  /// \param [in] a_trans Specifies the operation applied to A.
-  /// \param [in] b_trans Specifies the operation applied to B.
-  /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
-  /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
-  /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
-  /// \param [in] alpha Scaling factor for the matrix-matrix product.
-  /// \param [in] a Input matrix A.
-  /// \param [in] a_type Data type of the matrix A.
-  /// \param [in] lda Leading dimension of A.
-  /// \param [in] b Input matrix B.
-  /// \param [in] b_type Data type of the matrix B.
-  /// \param [in] ldb Leading dimension of B.
-  /// \param [in] beta Scaling factor for matrix C.
-  /// \param [in, out] c Input/Output matrix C.
-  /// \param [in] c_type Data type of the matrix C.
-  /// \param [in] ldc Leading dimension of C.
-  /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
-  /// \param [in] scaling_type Data type of the scaling factors.
-  inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                         oneapi::mkl::transpose b_trans, int m, int n, int k,
-                         const void *alpha, const void *a[],
-                         library_data_t a_type, int lda, const void *b[],
-                         library_data_t b_type, int ldb, const void *beta,
-                         void *c[], library_data_t c_type, int ldc,
-                         int batch_size, library_data_t scaling_type)
-  {
-#ifdef DPCT_USM_LEVEL_NONE
-    throw std::runtime_error("this API is unsupported when USM level is none");
-#else
-    bool matched = false;
-    if (scaling_type == library_data_t::real_float &&
-        c_type == library_data_t::complex_float)
-    {
-      scaling_type = library_data_t::complex_float;
-    }
-    else if (scaling_type == library_data_t::real_double &&
-             c_type == library_data_t::complex_double)
-    {
-      scaling_type = library_data_t::complex_double;
-    }
-
-    std::uint64_t key =
-        detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-    switch (key)
-    {
-    case detail::get_type_combination_id(
-        library_data_t::real_float, library_data_t::real_float,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_batch_impl<float, float, float, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-          batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_double, library_data_t::real_double,
-        library_data_t::real_double, library_data_t::real_double):
-    {
-      detail::gemm_batch_impl<double, double, double, double>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-          batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::complex_float, library_data_t::complex_float,
-        library_data_t::complex_float, library_data_t::complex_float):
-    {
-      detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                              std::complex<float>, std::complex<float>>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-          batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::complex_double, library_data_t::complex_double,
-        library_data_t::complex_double, library_data_t::complex_double):
-    {
-      detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                              std::complex<double>, std::complex<double>>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-          batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_half):
-    {
-      detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                              sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                          a, lda, b, ldb, beta, c, ldc,
-                                          batch_size);
-      break;
-    }
-#ifdef __INTEL_MKL__
-    case detail::get_type_combination_id(
-        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_bfloat16, library_data_t::real_float):
-    {
-      detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                              oneapi::mkl::bfloat16, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-          batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                     b, ldb, beta, c, ldc, batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_int32, library_data_t::real_int32):
-    {
-      float alpha_float =
-          dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-      float beta_float =
-          dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-      detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                              float>(q, a_trans, b_trans, m, n, k, &alpha_float,
-                                     a, lda, b, ldb, &beta_float, c, ldc,
-                                     batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-          batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-          batch_size);
-      break;
-    }
-#endif
-    case detail::get_type_combination_id(
-        library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_float):
-    {
-      float alpha_value =
-          dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-      float beta_value =
-          dpct::get_value(reinterpret_cast<const float *>(beta), q);
-      sycl::half alpha_half(alpha_value);
-      sycl::half beta_half(beta_value);
-      detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-          q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
-          batch_size);
-      break;
-    }
-    default:
-      throw std::runtime_error("the combination of data type is unsupported");
-    }
-#endif
-  }
-
-  /// Computes a batch of matrix-matrix product with general matrices.
-  /// \param [in] q The queue where the routine should be executed.
-  /// \param [in] a_trans Specifies the operation applied to A.
-  /// \param [in] b_trans Specifies the operation applied to B.
-  /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
-  /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
-  /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
-  /// \param [in] alpha Scaling factor for the matrix-matrix product.
-  /// \param [in] a Input matrix A.
-  /// \param [in] a_type Data type of the matrix A.
-  /// \param [in] lda Leading dimension of A.
-  /// \param [in] stride_a Stride between the different A matrices.
-  /// \param [in] b Input matrix B.
-  /// \param [in] b_type Data type of the matrix B.
-  /// \param [in] ldb Leading dimension of B.
-  /// \param [in] stride_b Stride between the different B matrices.
-  /// \param [in] beta Scaling factor for matrix C.
-  /// \param [in, out] c Input/Output matrix C.
-  /// \param [in] c_type Data type of the matrix C.
-  /// \param [in] ldc Leading dimension of C.
-  /// \param [in] stride_c Stride between the different C matrices.
-  /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
-  /// \param [in] scaling_type Data type of the scaling factors.
-  inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                         oneapi::mkl::transpose b_trans, int m, int n, int k,
-                         const void *alpha, const void *a, library_data_t a_type,
-                         int lda, long long int stride_a, const void *b,
-                         library_data_t b_type, int ldb, long long int stride_b,
-                         const void *beta, void *c, library_data_t c_type,
-                         int ldc, long long int stride_c, int batch_size,
-                         library_data_t scaling_type)
-  {
-    bool matched = false;
-    if (scaling_type == library_data_t::real_float &&
-        c_type == library_data_t::complex_float)
-    {
-      scaling_type = library_data_t::complex_float;
-    }
-    else if (scaling_type == library_data_t::real_double &&
-             c_type == library_data_t::complex_double)
-    {
-      scaling_type = library_data_t::complex_double;
-    }
-
-    std::uint64_t key =
-        detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-    switch (key)
-    {
-    case detail::get_type_combination_id(
-        library_data_t::real_float, library_data_t::real_float,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_batch_impl<float, float, float, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-          beta, c, ldc, stride_c, batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_double, library_data_t::real_double,
-        library_data_t::real_double, library_data_t::real_double):
-    {
-      detail::gemm_batch_impl<double, double, double, double>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-          beta, c, ldc, stride_c, batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::complex_float, library_data_t::complex_float,
-        library_data_t::complex_float, library_data_t::complex_float):
-    {
-      detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                              std::complex<float>, std::complex<float>>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-          beta, c, ldc, stride_c, batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::complex_double, library_data_t::complex_double,
-        library_data_t::complex_double, library_data_t::complex_double):
-    {
-      detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                              std::complex<double>, std::complex<double>>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-          beta, c, ldc, stride_c, batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_half):
-    {
-      detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                              sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                          a, lda, stride_a, b, ldb, stride_b,
-                                          beta, c, ldc, stride_c, batch_size);
-      break;
-    }
-#ifdef __INTEL_MKL__
-    case detail::get_type_combination_id(
-        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_bfloat16, library_data_t::real_float):
-    {
-      detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                              oneapi::mkl::bfloat16, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-          beta, c, ldc, stride_c, batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                     stride_a, b, ldb, stride_b, beta, c, ldc,
-                                     stride_c, batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_int32, library_data_t::real_int32):
-    {
-      detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                              std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
-                                            a, lda, stride_a, b, ldb, stride_b,
-                                            beta, c, ldc, stride_c, batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_int8, library_data_t::real_int8,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-          beta, c, ldc, stride_c, batch_size);
-      break;
-    }
-    case detail::get_type_combination_id(
-        library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_float, library_data_t::real_float):
-    {
-      detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-          q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-          beta, c, ldc, stride_c, batch_size);
-      break;
-    }
-#endif
-    case detail::get_type_combination_id(
-        library_data_t::real_half, library_data_t::real_half,
-        library_data_t::real_half, library_data_t::real_float):
-    {
-      float alpha_value =
-          dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-      float beta_value =
-          dpct::get_value(reinterpret_cast<const float *>(beta), q);
-      sycl::half alpha_half(alpha_value);
-      sycl::half beta_half(beta_value);
-      detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-          q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
-          &beta_half, c, ldc, stride_c, batch_size);
-      break;
-    }
-    default:
-      throw std::runtime_error("the combination of data type is unsupported");
-    }
-  }
-
-  static inline void
-  async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
-                    size_t from_pitch, size_t x, size_t y,
-                    memcpy_direction direction = automatic,
-                    sycl::queue &q = get_default_queue())
-  {
-    detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
-                        direction);
-  }
-
-  using err0 = detail::generic_error_type<struct err0_tag, int>;
-  using err1 = detail::generic_error_type<struct err1_tag, int>;
-
-} // COPY from DPCT head files
\ No newline at end of file
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 97398a209d1c6..ba0c67d4398e8 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -19,10 +19,7 @@
 
 #include <sycl/sycl.hpp>
 #include <sycl/half_type.hpp>
-// #include <dpct/dpct.hpp>
-// #include <dpct/blas_utils.hpp>
-// #include <dpct/lib_common_utils.hpp>
-#include "dpct.hpp"
+
 #include "ggml-sycl.h"
 #include "ggml.h"
 #include "ggml-backend-impl.h"
diff --git a/ggml-sycl.h b/ggml-sycl.h
index 02b4ab258c3bd..e4379a9876845 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -55,3 +55,2810 @@ void log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cn
 #ifdef  __cplusplus
 }
 #endif
+
+
+/*
+Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
+*/
+#ifdef __cplusplus
+
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include <map>
+
+#if defined(__linux__)
+#include <sys/mman.h>
+#elif defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#else
+#error "Only support Windows and Linux."
+#endif
+
+#if defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#endif
+#if defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#define DPCT_COMPATIBILITY_TEMP (900)
+
+#if defined(_MSC_VER)
+#define __dpct_align__(n) __declspec(align(n))
+#define __dpct_inline__ __forceinline
+#else
+#define __dpct_align__(n) __attribute__((aligned(n)))
+#define __dpct_inline__ __inline__ __attribute__((always_inline))
+#endif
+
+#if defined(_MSC_VER)
+#define __dpct_noinline__ __declspec(noinline)
+#else
+#define __dpct_noinline__ __attribute__((noinline))
+#endif
+
+namespace dpct
+{
+    typedef sycl::queue *queue_ptr;
+    typedef sycl::event *event_ptr;
+    typedef char *device_ptr;
+    typedef uint8_t byte_t;
+    typedef sycl::buffer<byte_t> buffer_t;
+
+    /// SYCL default exception handler
+    inline auto exception_handler = [](sycl::exception_list exceptions)
+    {
+        for (std::exception_ptr const &e : exceptions)
+        {
+            try
+            {
+                std::rethrow_exception(e);
+            }
+            catch (sycl::exception const &e)
+            {
+                std::cerr << "Caught asynchronous SYCL exception:" << std::endl
+                          << e.what() << std::endl
+                          << "Exception caught at file:" << __FILE__
+                          << ", line:" << __LINE__ << std::endl;
+            }
+        }
+    };
+
+    enum error_code
+    {
+        success = 0,
+        default_error = 999
+    };
+
+    enum memcpy_direction
+    {
+        host_to_host,
+        host_to_device,
+        device_to_host,
+        device_to_device,
+        automatic
+    };
+
+    enum memory_region
+    {
+        global = 0, // device global memory
+        constant,   // device constant memory
+        local,      // device local memory
+        shared,     // memory which can be accessed by host and device
+    };
+
+    enum class library_data_t : unsigned char
+    {
+        real_float = 0,
+        complex_float,
+        real_double,
+        complex_double,
+        real_half,
+        complex_half,
+        real_bfloat16,
+        complex_bfloat16,
+        real_int4,
+        complex_int4,
+        real_uint4,
+        complex_uint4,
+        real_int8,
+        complex_int8,
+        real_uint8,
+        complex_uint8,
+        real_int16,
+        complex_int16,
+        real_uint16,
+        complex_uint16,
+        real_int32,
+        complex_int32,
+        real_uint32,
+        complex_uint32,
+        real_int64,
+        complex_int64,
+        real_uint64,
+        complex_uint64,
+        real_int8_4,
+        real_int8_32,
+        real_uint8_4,
+        library_data_t_size
+    };
+
+    template <typename T>
+    struct DataType
+    {
+        using T2 = T;
+    };
+    template <typename T>
+    struct DataType<sycl::vec<T, 2>>
+    {
+        using T2 = std::complex<T>;
+    };
+
+    static void destroy_event(event_ptr event)
+    {
+        delete event;
+    }
+
+    static inline unsigned int get_tid()
+    {
+#if defined(__linux__)
+        return syscall(SYS_gettid);
+#elif defined(_WIN64)
+        return GetCurrentThreadId();
+#else
+#error "Only support Windows and Linux."
+#endif
+    }
+
+    namespace detail
+    {
+        static void get_version(const sycl::device &dev, int &major, int &minor)
+        {
+            // Version string has the following format:
+            // a. OpenCL<space><major.minor><space><vendor-specific-information>
+            // b. <major.minor>
+            std::string ver;
+            ver = dev.get_info<sycl::info::device::version>();
+            std::string::size_type i = 0;
+            while (i < ver.size())
+            {
+                if (isdigit(ver[i]))
+                    break;
+                i++;
+            }
+            major = std::stoi(&(ver[i]));
+            while (i < ver.size())
+            {
+                if (ver[i] == '.')
+                    break;
+                i++;
+            }
+            i++;
+            minor = std::stoi(&(ver[i]));
+        }
+
+        template <typename tag, typename T>
+        class generic_error_type
+        {
+        public:
+            generic_error_type() = default;
+            generic_error_type(T value) : value{value} {}
+            operator T() const { return value; }
+
+        private:
+            T value;
+        };
+
+    } // namespace detail
+
+    /// Pitched 2D/3D memory data.
+    class pitched_data
+    {
+    public:
+        pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
+        pitched_data(void *data, size_t pitch, size_t x, size_t y)
+            : _data(data), _pitch(pitch), _x(x), _y(y) {}
+
+        void *get_data_ptr() { return _data; }
+        void set_data_ptr(void *data) { _data = data; }
+
+        size_t get_pitch() { return _pitch; }
+        void set_pitch(size_t pitch) { _pitch = pitch; }
+
+        size_t get_x() { return _x; }
+        void set_x(size_t x) { _x = x; };
+
+        size_t get_y() { return _y; }
+        void set_y(size_t y) { _y = y; }
+
+    private:
+        void *_data;
+        size_t _pitch, _x, _y;
+    };
+
+    class device_info
+    {
+    public:
+        // get interface
+        const char *get_name() const { return _name; }
+        char *get_name() { return _name; }
+        template <typename WorkItemSizesTy = sycl::range<3>,
+                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                       std::is_same_v<WorkItemSizesTy, int *>,
+                                   int> = 0>
+        auto get_max_work_item_sizes() const
+        {
+            if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
+                return sycl::range<3>(_max_work_item_sizes_i[0],
+                                      _max_work_item_sizes_i[1],
+                                      _max_work_item_sizes_i[2]);
+            else
+            {
+                return _max_work_item_sizes_i;
+            }
+        }
+        template <typename WorkItemSizesTy = sycl::range<3>,
+                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                       std::is_same_v<WorkItemSizesTy, int *>,
+                                   int> = 0>
+        auto get_max_work_item_sizes()
+        {
+            if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
+                return sycl::range<3>(_max_work_item_sizes_i[0],
+                                      _max_work_item_sizes_i[1],
+                                      _max_work_item_sizes_i[2]);
+            else
+            {
+                return _max_work_item_sizes_i;
+            }
+        }
+        bool get_host_unified_memory() const { return _host_unified_memory; }
+        int get_major_version() const { return _major; }
+        int get_minor_version() const { return _minor; }
+        int get_integrated() const { return _integrated; }
+        int get_max_clock_frequency() const { return _frequency; }
+        int get_max_compute_units() const { return _max_compute_units; }
+        int get_max_work_group_size() const { return _max_work_group_size; }
+        int get_max_sub_group_size() const { return _max_sub_group_size; }
+        int get_max_work_items_per_compute_unit() const
+        {
+            return _max_work_items_per_compute_unit;
+        }
+        int get_max_register_size_per_work_group() const
+        {
+            return _max_register_size_per_work_group;
+        }
+        template <typename NDRangeSizeTy = size_t *,
+                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                       std::is_same_v<NDRangeSizeTy, int *>,
+                                   int> = 0>
+        auto get_max_nd_range_size() const
+        {
+            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+                return _max_nd_range_size;
+            else
+                return _max_nd_range_size_i;
+        }
+        template <typename NDRangeSizeTy = size_t *,
+                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                       std::is_same_v<NDRangeSizeTy, int *>,
+                                   int> = 0>
+        auto get_max_nd_range_size()
+        {
+            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+                return _max_nd_range_size;
+            else
+                return _max_nd_range_size_i;
+        }
+        size_t get_global_mem_size() const { return _global_mem_size; }
+        size_t get_local_mem_size() const { return _local_mem_size; }
+        /// Returns the maximum clock rate of device's global memory in kHz. If
+        /// compiler does not support this API then returns default value 3200000 kHz.
+        unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
+        /// Returns the maximum bus width between device and memory in bits. If
+        /// compiler does not support this API then returns default value 64 bits.
+        unsigned int get_memory_bus_width() const { return _memory_bus_width; }
+        uint32_t get_device_id() const { return _device_id; }
+        std::array<unsigned char, 16> get_uuid() const { return _uuid; }
+        /// Returns global memory cache size in bytes.
+        unsigned int get_global_mem_cache_size() const
+        {
+            return _global_mem_cache_size;
+        }
+
+        // set interface
+        void set_name(const char *name)
+        {
+            size_t length = strlen(name);
+            if (length < 256)
+            {
+                std::memcpy(_name, name, length + 1);
+            }
+            else
+            {
+                std::memcpy(_name, name, 255);
+                _name[255] = '\0';
+            }
+        }
+        void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes)
+        {
+            for (int i = 0; i < 3; ++i)
+                _max_work_item_sizes_i[i] = max_work_item_sizes[i];
+        }
+        [[deprecated]] void
+        set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
+        {
+            for (int i = 0; i < 3; ++i)
+            {
+                _max_work_item_sizes_i[i] = max_work_item_sizes[i];
+            }
+        }
+        void set_host_unified_memory(bool host_unified_memory)
+        {
+            _host_unified_memory = host_unified_memory;
+        }
+        void set_major_version(int major) { _major = major; }
+        void set_minor_version(int minor) { _minor = minor; }
+        void set_integrated(int integrated) { _integrated = integrated; }
+        void set_max_clock_frequency(int frequency) { _frequency = frequency; }
+        void set_max_compute_units(int max_compute_units)
+        {
+            _max_compute_units = max_compute_units;
+        }
+        void set_global_mem_size(size_t global_mem_size)
+        {
+            _global_mem_size = global_mem_size;
+        }
+        void set_local_mem_size(size_t local_mem_size)
+        {
+            _local_mem_size = local_mem_size;
+        }
+        void set_max_work_group_size(int max_work_group_size)
+        {
+            _max_work_group_size = max_work_group_size;
+        }
+        void set_max_sub_group_size(int max_sub_group_size)
+        {
+            _max_sub_group_size = max_sub_group_size;
+        }
+        void
+        set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
+        {
+            _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
+        }
+        void set_max_nd_range_size(int max_nd_range_size[])
+        {
+            for (int i = 0; i < 3; i++)
+            {
+                _max_nd_range_size[i] = max_nd_range_size[i];
+                _max_nd_range_size_i[i] = max_nd_range_size[i];
+            }
+        }
+        void set_memory_clock_rate(unsigned int memory_clock_rate)
+        {
+            _memory_clock_rate = memory_clock_rate;
+        }
+        void set_memory_bus_width(unsigned int memory_bus_width)
+        {
+            _memory_bus_width = memory_bus_width;
+        }
+        void
+        set_max_register_size_per_work_group(int max_register_size_per_work_group)
+        {
+            _max_register_size_per_work_group = max_register_size_per_work_group;
+        }
+        void set_device_id(uint32_t device_id)
+        {
+            _device_id = device_id;
+        }
+        void set_uuid(std::array<unsigned char, 16> uuid)
+        {
+            _uuid = std::move(uuid);
+        }
+        void set_global_mem_cache_size(unsigned int global_mem_cache_size)
+        {
+            _global_mem_cache_size = global_mem_cache_size;
+        }
+
+    private:
+        char _name[256];
+        int _max_work_item_sizes_i[3];
+        bool _host_unified_memory = false;
+        int _major;
+        int _minor;
+        int _integrated = 0;
+        int _frequency;
+        // Set estimated value 3200000 kHz as default value.
+        unsigned int _memory_clock_rate = 3200000;
+        // Set estimated value 64 bits as default value.
+        unsigned int _memory_bus_width = 64;
+        unsigned int _global_mem_cache_size;
+        int _max_compute_units;
+        int _max_work_group_size;
+        int _max_sub_group_size;
+        int _max_work_items_per_compute_unit;
+        int _max_register_size_per_work_group;
+        size_t _global_mem_size;
+        size_t _local_mem_size;
+        size_t _max_nd_range_size[3];
+        int _max_nd_range_size_i[3];
+        uint32_t _device_id;
+        std::array<unsigned char, 16> _uuid;
+    };
+
+    static int get_major_version(const sycl::device &dev)
+    {
+        int major, minor;
+        detail::get_version(dev, major, minor);
+        return major;
+    }
+
+    static int get_minor_version(const sycl::device &dev)
+    {
+        int major, minor;
+        detail::get_version(dev, major, minor);
+        return minor;
+    }
+
+    static void get_device_info(device_info &out, const sycl::device &dev)
+    {
+        device_info prop;
+        prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
+
+        int major, minor;
+        detail::get_version(dev, major, minor);
+        prop.set_major_version(major);
+        prop.set_minor_version(minor);
+
+        prop.set_max_work_item_sizes(
+#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
+            // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
+            // is an enum class element
+            dev.get_info<sycl::info::device::max_work_item_sizes>());
+#else
+            // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
+            // an int
+            dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
+#endif
+        prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));
+
+        prop.set_max_clock_frequency(
+            dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);
+
+        prop.set_max_compute_units(
+            dev.get_info<sycl::info::device::max_compute_units>());
+        prop.set_max_work_group_size(
+            dev.get_info<sycl::info::device::max_work_group_size>());
+        prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
+        prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
+
+#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
+        if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
+        {
+            unsigned int tmp =
+                dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
+            if (tmp != 0)
+                prop.set_memory_clock_rate(1000 * tmp);
+        }
+        if (dev.has(sycl::aspect::ext_intel_memory_bus_width))
+        {
+            prop.set_memory_bus_width(
+                dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
+        }
+        if (dev.has(sycl::aspect::ext_intel_device_id))
+        {
+            prop.set_device_id(
+                dev.get_info<sycl::ext::intel::info::device::device_id>());
+        }
+        if (dev.has(sycl::aspect::ext_intel_device_info_uuid))
+        {
+            prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
+        }
+#elif defined(_MSC_VER) && !defined(__clang__)
+#pragma message("get_device_info: querying memory_clock_rate and \
+        memory_bus_width are not supported by the compiler used. \
+        Use 3200000 kHz as memory_clock_rate default value. \
+        Use 64 bits as memory_bus_width default value.")
+#else
+#warning "get_device_info: querying memory_clock_rate and \
+        memory_bus_width are not supported by the compiler used. \
+        Use 3200000 kHz as memory_clock_rate default value. \
+        Use 64 bits as memory_bus_width default value."
+#endif
+
+        size_t max_sub_group_size = 1;
+        std::vector<size_t> sub_group_sizes =
+            dev.get_info<sycl::info::device::sub_group_sizes>();
+
+        for (const auto &sub_group_size : sub_group_sizes)
+        {
+            if (max_sub_group_size < sub_group_size)
+                max_sub_group_size = sub_group_size;
+        }
+
+        prop.set_max_sub_group_size(max_sub_group_size);
+
+        prop.set_max_work_items_per_compute_unit(
+            dev.get_info<sycl::info::device::max_work_group_size>());
+        int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+        prop.set_max_nd_range_size(max_nd_range_size);
+
+        // Estimates max register size per work group, feel free to update the value
+        // according to device properties.
+        prop.set_max_register_size_per_work_group(65536);
+
+        prop.set_global_mem_cache_size(
+            dev.get_info<sycl::info::device::global_mem_cache_size>());
+        out = prop;
+    }
+
+    /// dpct device extension
+    class device_ext : public sycl::device
+    {
+        typedef std::mutex mutex_type;
+
+    public:
+        device_ext() : sycl::device(), _ctx(*this) {}
+        ~device_ext()
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            clear_queues();
+        }
+        device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            init_queues();
+        }
+
+        int is_native_atomic_supported() { return 0; }
+        int get_major_version() const
+        {
+            return dpct::get_major_version(*this);
+        }
+
+        int get_minor_version() const
+        {
+            return dpct::get_minor_version(*this);
+        }
+
+        int get_max_compute_units() const
+        {
+            return get_device_info().get_max_compute_units();
+        }
+
+        /// Return the maximum clock frequency of this device in KHz.
+        int get_max_clock_frequency() const
+        {
+            return get_device_info().get_max_clock_frequency();
+        }
+
+        int get_integrated() const { return get_device_info().get_integrated(); }
+
+        int get_max_sub_group_size() const
+        {
+            return get_device_info().get_max_sub_group_size();
+        }
+
+        int get_max_register_size_per_work_group() const
+        {
+            return get_device_info().get_max_register_size_per_work_group();
+        }
+
+        int get_max_work_group_size() const
+        {
+            return get_device_info().get_max_work_group_size();
+        }
+
+        int get_mem_base_addr_align() const
+        {
+            return get_info<sycl::info::device::mem_base_addr_align>();
+        }
+
+        size_t get_global_mem_size() const
+        {
+            return get_device_info().get_global_mem_size();
+        }
+
+        /// Get the number of bytes of free and total memory on the SYCL device.
+        /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
+        /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
+        void get_memory_info(size_t &free_memory, size_t &total_memory)
+        {
+#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
+            if (!has(sycl::aspect::ext_intel_free_memory))
+            {
+                std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
+                free_memory = 0;
+            }
+            else
+            {
+                free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
+            }
+#else
+            std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
+            free_memory = 0;
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma message("Querying the number of bytes of free memory is not supported")
+#else
+#warning "Querying the number of bytes of free memory is not supported"
+#endif
+#endif
+            total_memory = get_device_info().get_global_mem_size();
+        }
+
+        void get_device_info(device_info &out) const
+        {
+            dpct::get_device_info(out, *this);
+        }
+
+        device_info get_device_info() const
+        {
+            device_info prop;
+            dpct::get_device_info(prop, *this);
+            return prop;
+        }
+
+        void reset()
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            clear_queues();
+            init_queues();
+        }
+
+        sycl::queue &in_order_queue() { return *_q_in_order; }
+
+        sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
+
+        sycl::queue &default_queue()
+        {
+#ifdef DPCT_USM_LEVEL_NONE
+            return out_of_order_queue();
+#else
+            return in_order_queue();
+#endif // DPCT_USM_LEVEL_NONE
+        }
+
+        void queues_wait_and_throw()
+        {
+            std::unique_lock<mutex_type> lock(m_mutex);
+            std::vector<std::shared_ptr<sycl::queue>> current_queues(
+                _queues);
+            lock.unlock();
+            for (const auto &q : current_queues)
+            {
+                q->wait_and_throw();
+            }
+            // Guard the destruct of current_queues to make sure the ref count is safe.
+            lock.lock();
+        }
+
+        sycl::queue *create_queue(bool enable_exception_handler = false)
+        {
+#ifdef DPCT_USM_LEVEL_NONE
+            return create_out_of_order_queue(enable_exception_handler);
+#else
+            return create_in_order_queue(enable_exception_handler);
+#endif // DPCT_USM_LEVEL_NONE
+        }
+
+        sycl::queue *create_in_order_queue(bool enable_exception_handler = false)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            return create_queue_impl(enable_exception_handler,
+                                     sycl::property::queue::in_order());
+        }
+
+        sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            return create_queue_impl(enable_exception_handler);
+        }
+
+        void destroy_queue(sycl::queue *&queue)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
+                                         [=](const std::shared_ptr<sycl::queue> &q) -> bool
+                                         {
+                                             return q.get() == queue;
+                                         }),
+                          _queues.end());
+            queue = nullptr;
+        }
+        void set_saved_queue(sycl::queue *q)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            _saved_queue = q;
+        }
+        sycl::queue *get_saved_queue() const
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            return _saved_queue;
+        }
+        sycl::context get_context() const { return _ctx; }
+
+    private:
+        void clear_queues()
+        {
+            _queues.clear();
+            _q_in_order = _q_out_of_order = _saved_queue = nullptr;
+        }
+
+        void init_queues()
+        {
+            _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
+            _q_out_of_order = create_queue_impl(true);
+            _saved_queue = &default_queue();
+        }
+
+        /// Caller should acquire resource \p m_mutex before calling this function.
+        template <class... Properties>
+        sycl::queue *create_queue_impl(bool enable_exception_handler,
+                                       Properties... properties)
+        {
+            sycl::async_handler eh = {};
+            if (enable_exception_handler)
+            {
+                eh = exception_handler;
+            }
+            _queues.push_back(std::make_shared<sycl::queue>(
+                _ctx, *this, eh,
+                sycl::property_list(
+#ifdef DPCT_PROFILING_ENABLED
+                    sycl::property::queue::enable_profiling(),
+#endif
+                    properties...)));
+
+            return _queues.back().get();
+        }
+
+        void get_version(int &major, int &minor) const
+        {
+            detail::get_version(*this, major, minor);
+        }
+        sycl::queue *_q_in_order, *_q_out_of_order;
+        sycl::queue *_saved_queue;
+        sycl::context _ctx;
+        std::vector<std::shared_ptr<sycl::queue>> _queues;
+        mutable mutex_type m_mutex;
+    };
+
+    /// device manager
+    class dev_mgr
+    {
+    public:
+        device_ext &current_device()
+        {
+            unsigned int dev_id = current_device_id();
+            check_id(dev_id);
+            return *_devs[dev_id];
+        }
+        device_ext &cpu_device() const
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            if (_cpu_device == -1)
+            {
+                throw std::runtime_error("no valid cpu device");
+            }
+            else
+            {
+                return *_devs[_cpu_device];
+            }
+        }
+        device_ext &get_device(unsigned int id) const
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            check_id(id);
+            return *_devs[id];
+        }
+        unsigned int current_device_id() const
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            auto it = _thread2dev_map.find(get_tid());
+            if (it != _thread2dev_map.end())
+                return it->second;
+            return DEFAULT_DEVICE_ID;
+        }
+
+        /// Select device with a device ID.
+        /// \param [in] id The id of the device which can
+        /// be obtained through get_device_id(const sycl::device).
+        void select_device(unsigned int id)
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            check_id(id);
+            _thread2dev_map[get_tid()] = id;
+        }
+        unsigned int device_count() { return _devs.size(); }
+
+        unsigned int get_device_id(const sycl::device &dev)
+        {
+            unsigned int id = 0;
+            for (auto dev_item : _devs)
+            {
+                if (*dev_item == dev)
+                {
+                    break;
+                }
+                id++;
+            }
+            return id;
+        }
+
+        template <class DeviceSelector>
+        std::enable_if_t<
+            std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
+        select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
+        {
+            sycl::device selected_device = sycl::device(selector);
+            unsigned int selected_device_id = get_device_id(selected_device);
+            select_device(selected_device_id);
+        }
+
+        /// Returns the instance of device manager singleton.
+        static dev_mgr &instance()
+        {
+            static dev_mgr d_m;
+            return d_m;
+        }
+        dev_mgr(const dev_mgr &) = delete;
+        dev_mgr &operator=(const dev_mgr &) = delete;
+        dev_mgr(dev_mgr &&) = delete;
+        dev_mgr &operator=(dev_mgr &&) = delete;
+
+    private:
+        mutable std::recursive_mutex m_mutex;
+        dev_mgr()
+        {
+            sycl::device default_device =
+                sycl::device(sycl::default_selector_v);
+            _devs.push_back(std::make_shared<device_ext>(default_device));
+
+            std::vector<sycl::device> sycl_all_devs =
+                sycl::device::get_devices(sycl::info::device_type::all);
+            // Collect other devices except for the default device.
+            if (default_device.is_cpu())
+                _cpu_device = 0;
+            for (auto &dev : sycl_all_devs)
+            {
+                if (dev == default_device)
+                {
+                    continue;
+                }
+                _devs.push_back(std::make_shared<device_ext>(dev));
+                if (_cpu_device == -1 && dev.is_cpu())
+                {
+                    _cpu_device = _devs.size() - 1;
+                }
+            }
+        }
+        void check_id(unsigned int id) const
+        {
+            if (id >= _devs.size())
+            {
+                throw std::runtime_error("invalid device id");
+            }
+        }
+        std::vector<std::shared_ptr<device_ext>> _devs;
+        /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
+        /// thread id in _thread2dev_map, which means default device should be used
+        /// for the current thread.
+        const unsigned int DEFAULT_DEVICE_ID = 0;
+        /// thread-id to device-id map.
+        std::map<unsigned int, unsigned int> _thread2dev_map;
+        int _cpu_device = -1;
+    };
+
+    static inline sycl::queue &get_default_queue()
+    {
+        return dev_mgr::instance().current_device().default_queue();
+    }
+
+    namespace detail
+    {
+        enum class pointer_access_attribute
+        {
+            host_only = 0,
+            device_only,
+            host_device,
+            end
+        };
+
+        static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
+                                                              const void *ptr)
+        {
+#ifdef DPCT_USM_LEVEL_NONE
+            return mem_mgr::instance().is_device_ptr(ptr)
+                       ? pointer_access_attribute::device_only
+                       : pointer_access_attribute::host_only;
+#else
+            switch (sycl::get_pointer_type(ptr, q.get_context()))
+            {
+            case sycl::usm::alloc::unknown:
+                return pointer_access_attribute::host_only;
+            case sycl::usm::alloc::device:
+                return pointer_access_attribute::device_only;
+            case sycl::usm::alloc::shared:
+            case sycl::usm::alloc::host:
+                return pointer_access_attribute::host_device;
+            }
+#endif
+        }
+
+        template <typename ArgT>
+        inline constexpr std::uint64_t get_type_combination_id(ArgT Val)
+        {
+            static_assert((unsigned char)library_data_t::library_data_t_size <=
+                              std::numeric_limits<unsigned char>::max() &&
+                          "library_data_t size exceeds limit.");
+            static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
+            return (std::uint64_t)Val;
+        }
+
+        template <typename FirstT, typename... RestT>
+        inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
+                                                               RestT... RestVal)
+        {
+            static_assert((std::uint8_t)library_data_t::library_data_t_size <=
+                              std::numeric_limits<unsigned char>::max() &&
+                          "library_data_t size exceeds limit.");
+            static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
+            static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
+            return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
+        }
+
+        class mem_mgr
+        {
+            mem_mgr()
+            {
+                // Reserved address space, no real memory allocation happens here.
+#if defined(__linux__)
+                mapped_address_space =
+                    (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
+                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+#elif defined(_WIN64)
+                mapped_address_space = (byte_t *)VirtualAlloc(
+                    NULL,               // NULL specified as the base address parameter
+                    mapped_region_size, // Size of allocation
+                    MEM_RESERVE,        // Allocate reserved pages
+                    PAGE_NOACCESS);     // Protection = no access
+#else
+#error "Only support Windows and Linux."
+#endif
+                next_free = mapped_address_space;
+            };
+
+        public:
+            using buffer_id_t = int;
+
+            struct allocation
+            {
+                buffer_t buffer;
+                byte_t *alloc_ptr;
+                size_t size;
+            };
+
+            ~mem_mgr()
+            {
+#if defined(__linux__)
+                munmap(mapped_address_space, mapped_region_size);
+#elif defined(_WIN64)
+                VirtualFree(mapped_address_space, 0, MEM_RELEASE);
+#else
+#error "Only support Windows and Linux."
+#endif
+            };
+
+            mem_mgr(const mem_mgr &) = delete;
+            mem_mgr &operator=(const mem_mgr &) = delete;
+            mem_mgr(mem_mgr &&) = delete;
+            mem_mgr &operator=(mem_mgr &&) = delete;
+
+            /// Allocate
+            void *mem_alloc(size_t size)
+            {
+                if (!size)
+                    return nullptr;
+                std::lock_guard<std::mutex> lock(m_mutex);
+                if (next_free + size > mapped_address_space + mapped_region_size)
+                {
+                    throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool");
+                }
+                // Allocation
+                sycl::range<1> r(size);
+                buffer_t buf(r);
+                allocation A{buf, next_free, size};
+                // Map allocation to device pointer
+                void *result = next_free;
+                m_map.emplace(next_free + size, A);
+                // Update pointer to the next free space.
+                next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
+
+                return result;
+            }
+
+            /// Deallocate
+            void mem_free(const void *ptr)
+            {
+                if (!ptr)
+                    return;
+                std::lock_guard<std::mutex> lock(m_mutex);
+                auto it = get_map_iterator(ptr);
+                m_map.erase(it);
+            }
+
+            /// map: device pointer -> allocation(buffer, alloc_ptr, size)
+            allocation translate_ptr(const void *ptr)
+            {
+                std::lock_guard<std::mutex> lock(m_mutex);
+                auto it = get_map_iterator(ptr);
+                return it->second;
+            }
+
+            /// Check if the pointer represents device pointer or not.
+            bool is_device_ptr(const void *ptr) const
+            {
+                std::lock_guard<std::mutex> lock(m_mutex);
+                return (mapped_address_space <= ptr) &&
+                       (ptr < mapped_address_space + mapped_region_size);
+            }
+
+            /// Returns the instance of memory manager singleton.
+            static mem_mgr &instance()
+            {
+                static mem_mgr m;
+                return m;
+            }
+
+        private:
+            std::map<byte_t *, allocation> m_map;
+            mutable std::mutex m_mutex;
+            byte_t *mapped_address_space;
+            byte_t *next_free;
+            const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
+            const size_t alignment = 256;
+            /// This padding may be defined to some positive value to debug
+            /// out of bound accesses.
+            const size_t extra_padding = 0;
+
+            std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
+            {
+                auto it = m_map.upper_bound((byte_t *)ptr);
+                if (it == m_map.end())
+                {
+                    // Not a virtual pointer.
+                    throw std::runtime_error("can not get buffer from non-virtual pointer");
+                }
+                const allocation &alloc = it->second;
+                if (ptr < alloc.alloc_ptr)
+                {
+                    // Out of bound.
+                    // This may happen if there's a gap between allocations due to alignment
+                    // or extra padding and pointer points to this gap.
+                    throw std::runtime_error("invalid virtual pointer");
+                }
+                return it;
+            }
+        };
+
+        template <class T, memory_region Memory, size_t Dimension>
+        class accessor;
+        template <memory_region Memory, class T = byte_t>
+        class memory_traits
+        {
+        public:
+            static constexpr sycl::access::target target =
+                sycl::access::target::device;
+            static constexpr sycl::access_mode mode =
+                (Memory == constant) ? sycl::access_mode::read
+                                     : sycl::access_mode::read_write;
+            static constexpr size_t type_size = sizeof(T);
+            using element_t =
+                typename std::conditional<Memory == constant, const T, T>::type;
+            using value_t = typename std::remove_cv<T>::type;
+            template <size_t Dimension = 1>
+            using accessor_t = typename std::conditional<
+                Memory == local, sycl::local_accessor<value_t, Dimension>,
+                sycl::accessor<T, Dimension, mode, target>>::type;
+            using pointer_t = T *;
+        };
+
+        static inline void *dpct_malloc(size_t size, sycl::queue &q)
+        {
+#ifdef DPCT_USM_LEVEL_NONE
+            return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
+#else
+            return sycl::malloc_device(size, q.get_device(), q.get_context());
+#endif // DPCT_USM_LEVEL_NONE
+        }
+
+#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
+        static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
+                                        sycl::queue &q)
+        {
+            pitch = PITCH_DEFAULT_ALIGN(x);
+            return dpct_malloc(pitch * y * z, q);
+        }
+
+        /**
+         * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] dev_ptr Pointer to the virtual device memory address.
+         * @param [in] value The value to be set.
+         * @param [in] size Number of elements to be set to the value.
+         * @return An event representing the memset operation.
+         */
+        template <typename valueT>
+        static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
+                                              valueT value, size_t size)
+        {
+#ifdef DPCT_USM_LEVEL_NONE
+            auto &mm = mem_mgr::instance();
+            assert(mm.is_device_ptr(dev_ptr));
+            auto alloc = mm.translate_ptr(dev_ptr);
+            size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
+
+            return q.submit([&](sycl::handler &cgh)
+                            {
+    auto r = sycl::range<1>(size);
+    auto o = sycl::id<1>(offset);
+    auto new_buffer = alloc.buffer.reinterpret<valueT>(
+        sycl::range<1>(alloc.size / sizeof(valueT)));
+    sycl::accessor<valueT, 1, sycl::access_mode::write,
+                sycl::access::target::device>
+        acc(new_buffer, cgh, r, o);
+    cgh.fill(acc, value); });
+#else
+            return q.fill(dev_ptr, value, size);
+#endif // DPCT_USM_LEVEL_NONE
+        }
+
+        /**
+         * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] data Pointer to the pitched device memory region.
+         * @param [in] value The value to be set.
+         * @param [in] size 3D memory region by number of elements.
+         * @return An event list representing the memset operations.
+         */
+        template <typename valueT>
+        static inline std::vector<sycl::event>
+        dpct_memset(sycl::queue &q, pitched_data data, valueT value,
+                    sycl::range<3> size)
+        {
+            std::vector<sycl::event> event_list;
+            size_t slice = data.get_pitch() * data.get_y();
+            unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
+            for (size_t z = 0; z < size.get(2); ++z)
+            {
+                unsigned char *data_ptr = data_surface;
+                for (size_t y = 0; y < size.get(1); ++y)
+                {
+                    event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
+                    data_ptr += data.get_pitch();
+                }
+                data_surface += slice;
+            }
+            return event_list;
+        }
+
+        /**
+         * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] ptr Pointer to the virtual device memory.
+         * @param [in] pitch The pitch size by number of elements, including padding.
+         * @param [in] val The value to be set.
+         * @param [in] x The width of memory region by number of elements.
+         * @param [in] y The height of memory region by number of elements.
+         * @return An event list representing the memset operations.
+         */
+        template <typename valueT>
+        static inline std::vector<sycl::event>
+        dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
+                    size_t y)
+        {
+            return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
+                               sycl::range<3>(x, y, 1));
+        }
+
+        static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
+                                                        const void *from_ptr,
+                                                        memcpy_direction dir)
+        {
+            switch (dir)
+            {
+            case memcpy_direction::host_to_host:
+            case memcpy_direction::host_to_device:
+            case memcpy_direction::device_to_host:
+            case memcpy_direction::device_to_device:
+                return dir;
+            case memcpy_direction::automatic:
+            {
+                // table[to_attribute][from_attribute]
+                static const memcpy_direction
+                    direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
+                                   [static_cast<unsigned>(pointer_access_attribute::end)] =
+                                       {{memcpy_direction::host_to_host,
+                                         memcpy_direction::device_to_host,
+                                         memcpy_direction::host_to_host},
+                                        {memcpy_direction::host_to_device,
+                                         memcpy_direction::device_to_device,
+                                         memcpy_direction::device_to_device},
+                                        {memcpy_direction::host_to_host,
+                                         memcpy_direction::device_to_device,
+                                         memcpy_direction::device_to_device}};
+                return direction_table[static_cast<unsigned>(get_pointer_attribute(
+                    q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
+            }
+            default:
+                throw std::runtime_error("dpct_memcpy: invalid direction value");
+            }
+        }
+
+        static sycl::event
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+                    memcpy_direction direction,
+                    const std::vector<sycl::event> &dep_events = {})
+        {
+            if (!size)
+                return sycl::event{};
+#ifdef DPCT_USM_LEVEL_NONE
+            auto &mm = mem_mgr::instance();
+            auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+
+            switch (real_direction)
+            {
+            case host_to_host:
+                return q.submit([&](sycl::handler &cgh)
+                                {
+    cgh.depends_on(dep_events);
+    cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
+            case host_to_device:
+            {
+                auto alloc = mm.translate_ptr(to_ptr);
+                size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
+                return q.submit([&](sycl::handler &cgh)
+                                {
+    cgh.depends_on(dep_events);
+    auto r = sycl::range<1>(size);
+    auto o = sycl::id<1>(offset);
+    sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                        sycl::access::target::device>
+        acc(alloc.buffer, cgh, r, o);
+    cgh.copy(from_ptr, acc); });
+            }
+            case device_to_host:
+            {
+                auto alloc = mm.translate_ptr(from_ptr);
+                size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
+                return q.submit([&](sycl::handler &cgh)
+                                {
+    cgh.depends_on(dep_events);
+    auto r = sycl::range<1>(size);
+    auto o = sycl::id<1>(offset);
+    sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                        sycl::access::target::device>
+        acc(alloc.buffer, cgh, r, o);
+    cgh.copy(acc, to_ptr); });
+            }
+            case device_to_device:
+            {
+                auto to_alloc = mm.translate_ptr(to_ptr);
+                auto from_alloc = mm.translate_ptr(from_ptr);
+                size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
+                size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
+                return q.submit([&](sycl::handler &cgh)
+                                {
+    cgh.depends_on(dep_events);
+    auto r = sycl::range<1>(size);
+    auto to_o = sycl::id<1>(to_offset);
+    auto from_o = sycl::id<1>(from_offset);
+    sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                        sycl::access::target::device>
+        to_acc(to_alloc.buffer, cgh, r, to_o);
+    sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                        sycl::access::target::device>
+        from_acc(from_alloc.buffer, cgh, r, from_o);
+    cgh.copy(from_acc, to_acc); });
+            }
+            default:
+                throw std::runtime_error("dpct_memcpy: invalid direction value");
+            }
+#else
+            return q.memcpy(to_ptr, from_ptr, size, dep_events);
+#endif // DPCT_USM_LEVEL_NONE
+        }
+
+        // Get actual copy range and make sure it will not exceed range.
+        static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
+                                            size_t pitch)
+        {
+            return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
+        }
+
+        static inline size_t get_offset(sycl::id<3> id, size_t slice,
+                                        size_t pitch)
+        {
+            return slice * id.get(2) + pitch * id.get(1) + id.get(0);
+        }
+
+        /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
+        /// and \p from_range to another specified by \p to_ptr and \p to_range.
+        static inline std::vector<sycl::event>
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                    sycl::range<3> to_range, sycl::range<3> from_range,
+                    sycl::id<3> to_id, sycl::id<3> from_id,
+                    sycl::range<3> size, memcpy_direction direction,
+                    const std::vector<sycl::event> &dep_events = {})
+        {
+            // RAII for host pointer
+            class host_buffer
+            {
+                void *_buf;
+                size_t _size;
+                sycl::queue &_q;
+                const std::vector<sycl::event> &_deps; // free operation depends
+
+            public:
+                host_buffer(size_t size, sycl::queue &q,
+                            const std::vector<sycl::event> &deps)
+                    : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
+                void *get_ptr() const { return _buf; }
+                size_t get_size() const { return _size; }
+                ~host_buffer()
+                {
+                    if (_buf)
+                    {
+                        _q.submit([&](sycl::handler &cgh)
+                                  {
+        cgh.depends_on(_deps);
+        cgh.host_task([buf = _buf] { std::free(buf); }); });
+                    }
+                }
+            };
+            std::vector<sycl::event> event_list;
+
+            size_t to_slice = to_range.get(1) * to_range.get(0),
+                   from_slice = from_range.get(1) * from_range.get(0);
+            unsigned char *to_surface =
+                (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+            const unsigned char *from_surface =
+                (const unsigned char *)from_ptr +
+                get_offset(from_id, from_slice, from_range.get(0));
+
+            if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
+            {
+                return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                                    direction, dep_events)};
+            }
+            direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+            size_t size_slice = size.get(1) * size.get(0);
+            switch (direction)
+            {
+            case host_to_host:
+                for (size_t z = 0; z < size.get(2); ++z)
+                {
+                    unsigned char *to_ptr = to_surface;
+                    const unsigned char *from_ptr = from_surface;
+                    if (to_range.get(0) == from_range.get(0) &&
+                        to_range.get(0) == size.get(0))
+                    {
+                        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
+                                                         direction, dep_events));
+                    }
+                    else
+                    {
+                        for (size_t y = 0; y < size.get(1); ++y)
+                        {
+                            event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
+                                                             direction, dep_events));
+                            to_ptr += to_range.get(0);
+                            from_ptr += from_range.get(0);
+                        }
+                    }
+                    to_surface += to_slice;
+                    from_surface += from_slice;
+                }
+                break;
+            case host_to_device:
+            {
+                host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
+                                event_list);
+                std::vector<sycl::event> host_events;
+                if (to_slice == size_slice)
+                {
+                    // Copy host data to a temp host buffer with the shape of target.
+                    host_events =
+                        dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
+                                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                                    host_to_host, dep_events);
+                }
+                else
+                {
+                    // Copy host data to a temp host buffer with the shape of target.
+                    host_events = dpct_memcpy(
+                        q, buf.get_ptr(), from_surface, to_range, from_range,
+                        sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
+                        // If has padding data, not sure whether it is useless. So fill temp
+                        // buffer with it.
+                        std::vector<sycl::event>{
+                            dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                                        device_to_host, dep_events)});
+                }
+                // Copy from temp host buffer to device with only one submit.
+                event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
+                                                 buf.get_size(), host_to_device,
+                                                 host_events));
+                break;
+            }
+            case device_to_host:
+            {
+                host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
+                                event_list);
+                // Copy from host temp buffer to host target with reshaping.
+                event_list = dpct_memcpy(
+                    q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
+                    sycl::id<3>(0, 0, 0), size, host_to_host,
+                    // Copy from device to temp host buffer with only one submit.
+                    std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
+                                                         buf.get_size(),
+                                                         device_to_host, dep_events)});
+                break;
+            }
+            case device_to_device:
+#ifdef DPCT_USM_LEVEL_NONE
+            {
+                auto &mm = mem_mgr::instance();
+                auto to_alloc = mm.translate_ptr(to_surface);
+                auto from_alloc = mm.translate_ptr(from_surface);
+                size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
+                size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
+                event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                              {
+    cgh.depends_on(dep_events);
+    auto to_o = sycl::id<1>(to_offset);
+    auto from_o = sycl::id<1>(from_offset);
+    sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                        sycl::access::target::device>
+        to_acc(to_alloc.buffer, cgh,
+                get_copy_range(size, to_slice, to_range.get(0)), to_o);
+    sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                        sycl::access::target::device>
+        from_acc(from_alloc.buffer, cgh,
+                get_copy_range(size, from_slice, from_range.get(0)), from_o);
+    cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
+        size,
+        [=](sycl::id<3> id) {
+            to_acc[get_offset(id, to_slice, to_range.get(0))] =
+                from_acc[get_offset(id, from_slice, from_range.get(0))];
+        }); }));
+            }
+#else
+                event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                              {
+    cgh.depends_on(dep_events);
+    cgh.parallel_for<class dpct_memcpy_3d_detail>(
+        size,
+        [=](sycl::id<3> id) {
+            to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                from_surface[get_offset(id, from_slice, from_range.get(0))];
+        }); }));
+#endif
+            break;
+            default:
+                throw std::runtime_error("dpct_memcpy: invalid direction value");
+            }
+            return event_list;
+        }
+
+        /// memcpy 2D/3D matrix specified by pitched_data.
+        static inline std::vector<sycl::event>
+        dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+                    pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+                    memcpy_direction direction = automatic)
+        {
+            return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
+                               sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                               sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                               size, direction);
+        }
+
+        /// memcpy 2D matrix with pitch.
+        static inline std::vector<sycl::event>
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                    size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+                    memcpy_direction direction = automatic)
+        {
+            return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
+                               sycl::range<3>(from_pitch, y, 1),
+                               sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                               sycl::range<3>(x, y, 1), direction);
+        }
+
+        namespace deprecated
+        {
+
+            template <typename T, sycl::usm::alloc AllocKind>
+            class usm_allocator
+            {
+            private:
+                using Alloc = sycl::usm_allocator<T, AllocKind>;
+                Alloc _impl;
+
+            public:
+                using value_type = typename std::allocator_traits<Alloc>::value_type;
+                using pointer = typename std::allocator_traits<Alloc>::pointer;
+                using const_pointer = typename std::allocator_traits<Alloc>::const_pointer;
+                using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
+                using const_void_pointer =
+                    typename std::allocator_traits<Alloc>::const_void_pointer;
+                using reference = typename std::allocator_traits<Alloc>::value_type &;
+                using const_reference =
+                    const typename std::allocator_traits<Alloc>::value_type &;
+                using difference_type =
+                    typename std::allocator_traits<Alloc>::difference_type;
+                using size_type = typename std::allocator_traits<Alloc>::size_type;
+                using propagate_on_container_copy_assignment = typename std::allocator_traits<
+                    Alloc>::propagate_on_container_copy_assignment;
+                using propagate_on_container_move_assignment = typename std::allocator_traits<
+                    Alloc>::propagate_on_container_move_assignment;
+                using propagate_on_container_swap =
+                    typename std::allocator_traits<Alloc>::propagate_on_container_swap;
+                using is_always_equal =
+                    typename std::allocator_traits<Alloc>::is_always_equal;
+
+                template <typename U>
+                struct rebind
+                {
+                    typedef usm_allocator<U, AllocKind> other;
+                };
+
+                usm_allocator() : _impl(dpct::get_default_queue()) {}
+                ~usm_allocator() {}
+                usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
+                usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
+                pointer address(reference r) { return &r; }
+                const_pointer address(const_reference r) { return &r; }
+                pointer allocate(size_type cnt, const_void_pointer hint = nullptr)
+                {
+                    return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint);
+                }
+                void deallocate(pointer p, size_type cnt)
+                {
+                    std::allocator_traits<Alloc>::deallocate(_impl, p, cnt);
+                }
+                size_type max_size() const
+                {
+                    return std::allocator_traits<Alloc>::max_size(_impl);
+                }
+                bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
+                bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
+            };
+
+        } // namespace deprecated
+
+        inline void dpct_free(void *ptr,
+                              const sycl::queue &q)
+        {
+            if (ptr)
+            {
+#ifdef DPCT_USM_LEVEL_NONE
+                detail::mem_mgr::instance().mem_free(ptr);
+#else
+                sycl::free(ptr, q.get_context());
+#endif // DPCT_USM_LEVEL_NONE
+            }
+        }
+
+        template <typename T>
+        inline auto get_memory(const void *x)
+        {
+            T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
+#ifdef DPCT_USM_LEVEL_NONE
+            return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
+#else
+            return new_x;
+#endif
+        }
+
+        template <typename T>
+        inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q)
+        {
+            using Ty = typename DataType<T>::T2;
+            Ty s_h;
+            if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
+                detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
+                    .wait();
+            else
+                s_h = *reinterpret_cast<const Ty *>(s);
+            return s_h;
+        }
+
+    } // namespace detail
+
+    template <typename T>
+    inline auto get_value(const T *s, sycl::queue &q)
+    {
+        return detail::get_value(s, q);
+    }
+
+    namespace detail
+    {
+        template <class Ta, class Tb, class Tc, class Ts>
+        inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                              oneapi::mkl::transpose b_trans, int m, int n, int k,
+                              const void *alpha, const void *a, int lda, const void *b,
+                              int ldb, const void *beta, void *c, int ldc)
+        {
+#ifndef __INTEL_MKL__
+            throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                                     "Project does not support this API.");
+#else
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+            auto data_a = get_memory<const Ta>(a);
+            auto data_b = get_memory<const Tb>(b);
+            auto data_c = get_memory<Tc>(c);
+            oneapi::mkl::blas::column_major::gemm(
+                q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
+                data_b, ldb, beta_value, data_c, ldc);
+#endif
+        }
+
+        template <typename VecT, class BinaryOperation, class = void>
+        class vectorized_binary
+        {
+        public:
+            inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
+            {
+                VecT v4;
+                for (size_t i = 0; i < v4.size(); ++i)
+                {
+                    v4[i] = binary_op(a[i], b[i]);
+                }
+                return v4;
+            }
+        };
+
+        template <typename VecT, class BinaryOperation>
+        class vectorized_binary<
+            VecT, BinaryOperation,
+            std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>>
+        {
+        public:
+            inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
+            {
+                return binary_op(a, b).template as<VecT>();
+            }
+        };
+
+        template <class Ta, class Tb, class Tc, class Ts>
+        inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                                    oneapi::mkl::transpose b_trans, int m, int n, int k,
+                                    const void *alpha, const void **a, int lda,
+                                    const void **b, int ldb, const void *beta, void **c,
+                                    int ldc, int batch_size)
+        {
+            struct matrix_info_t
+            {
+                oneapi::mkl::transpose transpose_info[2];
+                Ts value_info[2];
+                std::int64_t size_info[3];
+                std::int64_t ld_info[3];
+                std::int64_t groupsize_info;
+            };
+
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+
+            matrix_info_t *matrix_info =
+                (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
+            matrix_info->transpose_info[0] = a_trans;
+            matrix_info->transpose_info[1] = b_trans;
+            matrix_info->value_info[0] = alpha_value;
+            matrix_info->value_info[1] = beta_value;
+            matrix_info->size_info[0] = m;
+            matrix_info->size_info[1] = n;
+            matrix_info->size_info[2] = k;
+            matrix_info->ld_info[0] = lda;
+            matrix_info->ld_info[1] = ldb;
+            matrix_info->ld_info[2] = ldc;
+            matrix_info->groupsize_info = batch_size;
+
+            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
+                q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
+                matrix_info->size_info, matrix_info->size_info + 1,
+                matrix_info->size_info + 2, matrix_info->value_info,
+                reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
+                reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
+                matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
+                matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
+
+            q.submit([&](sycl::handler &cgh)
+                     {
+    cgh.depends_on(e);
+    cgh.host_task([=] { std::free(matrix_info); }); });
+        }
+
+        template <class Ta, class Tb, class Tc, class Ts>
+        inline void
+        gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                        oneapi::mkl::transpose b_trans, int m, int n,
+                        int k, const void *alpha, const void *a, int lda,
+                        long long int stride_a, const void *b, int ldb,
+                        long long int stride_b, const void *beta, void *c,
+                        int ldc, long long int stride_c, int batch_size)
+        {
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+            auto data_a = get_memory<const Ta>(a);
+            auto data_b = get_memory<const Tb>(b);
+            auto data_c = get_memory<Tc>(c);
+            oneapi::mkl::blas::column_major::gemm_batch(
+                q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
+                stride_a, data_b, ldb, stride_b, beta_value,
+                data_c, ldc, stride_c, batch_size);
+        }
+
+    } // namespace detail
+
+    template <typename VecT, class BinaryOperation>
+    inline unsigned vectorized_binary(unsigned a, unsigned b,
+                                      const BinaryOperation binary_op)
+    {
+        sycl::vec<unsigned, 1> v0{a}, v1{b};
+        auto v2 = v0.as<VecT>();
+        auto v3 = v1.as<VecT>();
+        auto v4 =
+            detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op);
+        v0 = v4.template as<sycl::vec<unsigned, 1>>();
+        return v0;
+    }
+
+    static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
+                                  memcpy_direction direction = automatic,
+                                  sycl::queue &q = dpct::get_default_queue())
+    {
+        detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
+    }
+
+    static inline unsigned int select_device(unsigned int id)
+    {
+        dev_mgr::instance().select_device(id);
+        return id;
+    }
+
+    template <typename T>
+    T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
+                               int logical_sub_group_size = 32)
+    {
+        unsigned int id = g.get_local_linear_id();
+        unsigned int start_index =
+            id / logical_sub_group_size * logical_sub_group_size;
+        unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
+        return sycl::select_from_group(g, x,
+                                       target_offset < logical_sub_group_size
+                                           ? start_index + target_offset
+                                           : id);
+    }
+
+    template <typename T>
+    sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
+    {
+        return sycl::vec<T, 1>(val)
+            .template as<sycl::vec<
+                std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
+            .template convert<T>();
+    }
+
+    template <typename T1, typename T2>
+    using dot_product_acc_t =
+        std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
+                           uint32_t, int32_t>;
+
+    template <typename T1, typename T2, typename T3>
+    inline auto dp4a(T1 a, T2 b, T3 c)
+    {
+        dot_product_acc_t<T1, T2> res = c;
+        auto va = extract_and_sign_or_zero_extend4(a);
+        auto vb = extract_and_sign_or_zero_extend4(b);
+        res += va[0] * vb[0];
+        res += va[1] * vb[1];
+        res += va[2] * vb[2];
+        res += va[3] * vb[3];
+        return res;
+    }
+
+    struct sub_sat
+    {
+        template <typename T>
+        auto operator()(const T x, const T y) const
+        {
+            return sycl::sub_sat(x, y);
+        }
+    };
+
+    template <typename S, typename T>
+    inline T vectorized_min(T a, T b)
+    {
+        sycl::vec<T, 1> v0{a}, v1{b};
+        auto v2 = v0.template as<S>();
+        auto v3 = v1.template as<S>();
+        auto v4 = sycl::min(v2, v3);
+        v0 = v4.template as<sycl::vec<T, 1>>();
+        return v0;
+    }
+
+    inline float pow(const float a, const int b) { return sycl::pown(a, b); }
+    inline double pow(const double a, const int b) { return sycl::pown(a, b); }
+    inline float pow(const float a, const float b) { return sycl::pow(a, b); }
+    inline double pow(const double a, const double b) { return sycl::pow(a, b); }
+    template <typename T, typename U>
+    inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
+    pow(const T a, const U b)
+    {
+        return sycl::pow(a, static_cast<T>(b));
+    }
+    template <typename T, typename U>
+    inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
+    pow(const T a, const U b)
+    {
+        return sycl::pow(static_cast<double>(a), static_cast<double>(b));
+    }
+
+    inline double min(const double a, const float b)
+    {
+        return sycl::fmin(a, static_cast<double>(b));
+    }
+    inline double min(const float a, const double b)
+    {
+        return sycl::fmin(static_cast<double>(a), b);
+    }
+    inline float min(const float a, const float b) { return sycl::fmin(a, b); }
+    inline double min(const double a, const double b) { return sycl::fmin(a, b); }
+    inline std::uint32_t min(const std::uint32_t a, const std::int32_t b)
+    {
+        return sycl::min(a, static_cast<std::uint32_t>(b));
+    }
+    inline std::uint32_t min(const std::int32_t a, const std::uint32_t b)
+    {
+        return sycl::min(static_cast<std::uint32_t>(a), b);
+    }
+    inline std::int32_t min(const std::int32_t a, const std::int32_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::int64_t b)
+    {
+        return sycl::min(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t min(const std::int64_t a, const std::uint64_t b)
+    {
+        return sycl::min(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::int64_t min(const std::int64_t a, const std::int64_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::int32_t b)
+    {
+        return sycl::min(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t min(const std::int32_t a, const std::uint64_t b)
+    {
+        return sycl::min(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b)
+    {
+        return sycl::min(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b)
+    {
+        return sycl::min(static_cast<std::uint64_t>(a), b);
+    }
+    // max function overloads.
+    // For floating-point types, `float` or `double` arguments are acceptable.
+    // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
+    // `std::int64_t` type arguments are acceptable.
+    inline double max(const double a, const float b)
+    {
+        return sycl::fmax(a, static_cast<double>(b));
+    }
+    inline double max(const float a, const double b)
+    {
+        return sycl::fmax(static_cast<double>(a), b);
+    }
+    inline float max(const float a, const float b) { return sycl::fmax(a, b); }
+    inline double max(const double a, const double b) { return sycl::fmax(a, b); }
+    inline std::uint32_t max(const std::uint32_t a, const std::int32_t b)
+    {
+        return sycl::max(a, static_cast<std::uint32_t>(b));
+    }
+    inline std::uint32_t max(const std::int32_t a, const std::uint32_t b)
+    {
+        return sycl::max(static_cast<std::uint32_t>(a), b);
+    }
+    inline std::int32_t max(const std::int32_t a, const std::int32_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::int64_t b)
+    {
+        return sycl::max(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t max(const std::int64_t a, const std::uint64_t b)
+    {
+        return sycl::max(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::int64_t max(const std::int64_t a, const std::int64_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::int32_t b)
+    {
+        return sycl::max(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t max(const std::int32_t a, const std::uint64_t b)
+    {
+        return sycl::max(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b)
+    {
+        return sycl::max(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b)
+    {
+        return sycl::max(static_cast<std::uint64_t>(a), b);
+    }
+
+    inline void
+    has_capability_or_fail(const sycl::device &dev,
+                           const std::initializer_list<sycl::aspect> &props)
+    {
+        for (const auto &it : props)
+        {
+            if (dev.has(it))
+                continue;
+            switch (it)
+            {
+            case sycl::aspect::fp64:
+                throw std::runtime_error("'double' is not supported in '" +
+                                         dev.get_info<sycl::info::device::name>() +
+                                         "' device");
+                break;
+            case sycl::aspect::fp16:
+                throw std::runtime_error("'half' is not supported in '" +
+                                         dev.get_info<sycl::info::device::name>() +
+                                         "' device");
+                break;
+            default:
+#define __SYCL_ASPECT(ASPECT, ID) \
+    case sycl::aspect::ASPECT:    \
+        return #ASPECT;
+#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
+#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
+                auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string
+                {
+                    switch (AspectNum)
+                    {
+#include <sycl/info/aspects.def>
+#include <sycl/info/aspects_deprecated.def>
+                    default:
+                        return "unknown aspect";
+                    }
+                };
+#undef __SYCL_ASPECT_DEPRECATED_ALIAS
+#undef __SYCL_ASPECT_DEPRECATED
+#undef __SYCL_ASPECT
+                throw std::runtime_error(
+                    "'" + getAspectNameStr(it) + "' is not supported in '" +
+                    dev.get_info<sycl::info::device::name>() + "' device");
+            }
+            break;
+        }
+    }
+
+    static inline unsigned int get_current_device_id()
+    {
+        return dev_mgr::instance().current_device_id();
+    }
+
+    static inline device_ext &get_current_device()
+    {
+        return dev_mgr::instance().current_device();
+    }
+
+    static inline sycl::queue &get_in_order_queue()
+    {
+        return dev_mgr::instance().current_device().in_order_queue();
+    }
+
+    static sycl::event
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+                memcpy_direction direction,
+                const std::vector<sycl::event> &dep_events = {})
+    {
+        if (!size)
+            return sycl::event{};
+#ifdef DPCT_USM_LEVEL_NONE
+        auto &mm = mem_mgr::instance();
+        auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+
+        switch (real_direction)
+        {
+        case host_to_host:
+            return q.submit([&](sycl::handler &cgh)
+                            {
+        cgh.depends_on(dep_events);
+        cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
+        case host_to_device:
+        {
+            auto alloc = mm.translate_ptr(to_ptr);
+            size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
+            return q.submit([&](sycl::handler &cgh)
+                            {
+        cgh.depends_on(dep_events);
+        auto r = sycl::range<1>(size);
+        auto o = sycl::id<1>(offset);
+        sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                            sycl::access::target::device>
+            acc(alloc.buffer, cgh, r, o);
+        cgh.copy(from_ptr, acc); });
+        }
+        case device_to_host:
+        {
+            auto alloc = mm.translate_ptr(from_ptr);
+            size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
+            return q.submit([&](sycl::handler &cgh)
+                            {
+        cgh.depends_on(dep_events);
+        auto r = sycl::range<1>(size);
+        auto o = sycl::id<1>(offset);
+        sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                            sycl::access::target::device>
+            acc(alloc.buffer, cgh, r, o);
+        cgh.copy(acc, to_ptr); });
+        }
+        case device_to_device:
+        {
+            auto to_alloc = mm.translate_ptr(to_ptr);
+            auto from_alloc = mm.translate_ptr(from_ptr);
+            size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
+            size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
+            return q.submit([&](sycl::handler &cgh)
+                            {
+        cgh.depends_on(dep_events);
+        auto r = sycl::range<1>(size);
+        auto to_o = sycl::id<1>(to_offset);
+        auto from_o = sycl::id<1>(from_offset);
+        sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                            sycl::access::target::device>
+            to_acc(to_alloc.buffer, cgh, r, to_o);
+        sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                            sycl::access::target::device>
+            from_acc(from_alloc.buffer, cgh, r, from_o);
+        cgh.copy(from_acc, to_acc); });
+        }
+        default:
+            throw std::runtime_error("dpct_memcpy: invalid direction value");
+        }
+#else
+        return q.memcpy(to_ptr, from_ptr, size, dep_events);
+#endif // DPCT_USM_LEVEL_NONE
+    }
+
+    // Get actual copy range and make sure it will not exceed range.
+    static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
+                                        size_t pitch)
+    {
+        return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
+    }
+
+    static inline size_t get_offset(sycl::id<3> id, size_t slice,
+                                    size_t pitch)
+    {
+        return slice * id.get(2) + pitch * id.get(1) + id.get(0);
+    }
+
+    /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
+    /// and \p from_range to another specified by \p to_ptr and \p to_range.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                sycl::range<3> to_range, sycl::range<3> from_range,
+                sycl::id<3> to_id, sycl::id<3> from_id,
+                sycl::range<3> size, memcpy_direction direction,
+                const std::vector<sycl::event> &dep_events = {})
+    {
+        // RAII for host pointer
+        class host_buffer
+        {
+            void *_buf;
+            size_t _size;
+            sycl::queue &_q;
+            const std::vector<sycl::event> &_deps; // free operation depends
+
+        public:
+            host_buffer(size_t size, sycl::queue &q,
+                        const std::vector<sycl::event> &deps)
+                : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
+            void *get_ptr() const { return _buf; }
+            size_t get_size() const { return _size; }
+            ~host_buffer()
+            {
+                if (_buf)
+                {
+                    _q.submit([&](sycl::handler &cgh)
+                              {
+            cgh.depends_on(_deps);
+            cgh.host_task([buf = _buf] { std::free(buf); }); });
+                }
+            }
+        };
+        std::vector<sycl::event> event_list;
+
+        size_t to_slice = to_range.get(1) * to_range.get(0),
+               from_slice = from_range.get(1) * from_range.get(0);
+        unsigned char *to_surface =
+            (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+        const unsigned char *from_surface =
+            (const unsigned char *)from_ptr +
+            get_offset(from_id, from_slice, from_range.get(0));
+
+        if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
+        {
+            return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                                direction, dep_events)};
+        }
+        direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+        size_t size_slice = size.get(1) * size.get(0);
+        switch (direction)
+        {
+        case host_to_host:
+            for (size_t z = 0; z < size.get(2); ++z)
+            {
+                unsigned char *to_ptr = to_surface;
+                const unsigned char *from_ptr = from_surface;
+                if (to_range.get(0) == from_range.get(0) &&
+                    to_range.get(0) == size.get(0))
+                {
+                    event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
+                                                     direction, dep_events));
+                }
+                else
+                {
+                    for (size_t y = 0; y < size.get(1); ++y)
+                    {
+                        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
+                                                         direction, dep_events));
+                        to_ptr += to_range.get(0);
+                        from_ptr += from_range.get(0);
+                    }
+                }
+                to_surface += to_slice;
+                from_surface += from_slice;
+            }
+            break;
+        case host_to_device:
+        {
+            host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
+                            event_list);
+            std::vector<sycl::event> host_events;
+            if (to_slice == size_slice)
+            {
+                // Copy host data to a temp host buffer with the shape of target.
+                host_events =
+                    dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
+                                sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                                host_to_host, dep_events);
+            }
+            else
+            {
+                // Copy host data to a temp host buffer with the shape of target.
+                host_events = dpct_memcpy(
+                    q, buf.get_ptr(), from_surface, to_range, from_range,
+                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
+                    // If has padding data, not sure whether it is useless. So fill temp
+                    // buffer with it.
+                    std::vector<sycl::event>{
+                        dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                                    device_to_host, dep_events)});
+            }
+            // Copy from temp host buffer to device with only one submit.
+            event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
+                                             buf.get_size(), host_to_device,
+                                             host_events));
+            break;
+        }
+        case device_to_host:
+        {
+            host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
+                            event_list);
+            // Copy from host temp buffer to host target with reshaping.
+            event_list = dpct_memcpy(
+                q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
+                sycl::id<3>(0, 0, 0), size, host_to_host,
+                // Copy from device to temp host buffer with only one submit.
+                std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
+                                                     buf.get_size(),
+                                                     device_to_host, dep_events)});
+            break;
+        }
+        case device_to_device:
+#ifdef DPCT_USM_LEVEL_NONE
+        {
+            auto &mm = mem_mgr::instance();
+            auto to_alloc = mm.translate_ptr(to_surface);
+            auto from_alloc = mm.translate_ptr(from_surface);
+            size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
+            size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
+            event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                          {
+        cgh.depends_on(dep_events);
+        auto to_o = sycl::id<1>(to_offset);
+        auto from_o = sycl::id<1>(from_offset);
+        sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                            sycl::access::target::device>
+            to_acc(to_alloc.buffer, cgh,
+                    get_copy_range(size, to_slice, to_range.get(0)), to_o);
+        sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                            sycl::access::target::device>
+            from_acc(from_alloc.buffer, cgh,
+                    get_copy_range(size, from_slice, from_range.get(0)), from_o);
+        cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
+            size,
+            [=](sycl::id<3> id) {
+                to_acc[get_offset(id, to_slice, to_range.get(0))] =
+                    from_acc[get_offset(id, from_slice, from_range.get(0))];
+            }); }));
+        }
+#else
+            event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                          {
+        cgh.depends_on(dep_events);
+        cgh.parallel_for<class dpct_memcpy_3d_detail>(
+            size,
+            [=](sycl::id<3> id) {
+                to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                    from_surface[get_offset(id, from_slice, from_range.get(0))];
+            }); }));
+#endif
+        break;
+        default:
+            throw std::runtime_error("dpct_memcpy: invalid direction value");
+        }
+        return event_list;
+    }
+
+    /// memcpy 2D/3D matrix specified by pitched_data.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+                memcpy_direction direction = automatic)
+    {
+        return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
+                           sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                           sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                           size, direction);
+    }
+
+    /// memcpy 2D matrix with pitch.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+                memcpy_direction direction = automatic)
+    {
+        return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
+                           sycl::range<3>(from_pitch, y, 1),
+                           sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                           sycl::range<3>(x, y, 1), direction);
+    }
+
+    inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                     oneapi::mkl::transpose b_trans, int m, int n, int k,
+                     const void *alpha, const void *a, library_data_t a_type,
+                     int lda, const void *b, library_data_t b_type, int ldb,
+                     const void *beta, void *c, library_data_t c_type, int ldc,
+                     library_data_t scaling_type)
+    {
+        bool matched = false;
+        if (scaling_type == library_data_t::real_float &&
+            c_type == library_data_t::complex_float)
+        {
+            scaling_type = library_data_t::complex_float;
+        }
+        else if (scaling_type == library_data_t::real_double &&
+                 c_type == library_data_t::complex_double)
+        {
+            scaling_type = library_data_t::complex_double;
+        }
+
+        std::uint64_t key =
+            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+        switch (key)
+        {
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<float, float, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+            library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_impl<double, double, double, double>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_float, library_data_t::complex_float,
+            library_data_t::complex_float, library_data_t::complex_float):
+        {
+            detail::gemm_impl<std::complex<float>, std::complex<float>,
+                              std::complex<float>, std::complex<float>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_double, library_data_t::complex_double,
+            library_data_t::complex_double, library_data_t::complex_double):
+        {
+            detail::gemm_impl<std::complex<double>, std::complex<double>,
+                              std::complex<double>, std::complex<double>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                              sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
+                                          lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
+                                     ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                              sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
+                                          a, lda, b, ldb, &beta_half, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                              oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_int32, library_data_t::real_int32):
+        {
+            float alpha_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+            float beta_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+            detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
+                q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
+            break;
+        }
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
+        }
+    } // gemm()
+
+    /// Computes a batch of matrix-matrix product with general matrices.
+    /// \param [in] q The queue where the routine should be executed.
+    /// \param [in] a_trans Specifies the operation applied to A.
+    /// \param [in] b_trans Specifies the operation applied to B.
+    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+    /// \param [in] alpha Scaling factor for the matrix-matrix product.
+    /// \param [in] a Input matrix A.
+    /// \param [in] a_type Data type of the matrix A.
+    /// \param [in] lda Leading dimension of A.
+    /// \param [in] b Input matrix B.
+    /// \param [in] b_type Data type of the matrix B.
+    /// \param [in] ldb Leading dimension of B.
+    /// \param [in] beta Scaling factor for matrix C.
+    /// \param [in, out] c Input/Output matrix C.
+    /// \param [in] c_type Data type of the matrix C.
+    /// \param [in] ldc Leading dimension of C.
+    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+    /// \param [in] scaling_type Data type of the scaling factors.
+    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                           oneapi::mkl::transpose b_trans, int m, int n, int k,
+                           const void *alpha, const void *a[],
+                           library_data_t a_type, int lda, const void *b[],
+                           library_data_t b_type, int ldb, const void *beta,
+                           void *c[], library_data_t c_type, int ldc,
+                           int batch_size, library_data_t scaling_type)
+    {
+#ifdef DPCT_USM_LEVEL_NONE
+        throw std::runtime_error("this API is unsupported when USM level is none");
+#else
+        bool matched = false;
+        if (scaling_type == library_data_t::real_float &&
+            c_type == library_data_t::complex_float)
+        {
+            scaling_type = library_data_t::complex_float;
+        }
+        else if (scaling_type == library_data_t::real_double &&
+                 c_type == library_data_t::complex_double)
+        {
+            scaling_type = library_data_t::complex_double;
+        }
+
+        std::uint64_t key =
+            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+        switch (key)
+        {
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<float, float, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+            library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_batch_impl<double, double, double, double>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_float, library_data_t::complex_float,
+            library_data_t::complex_float, library_data_t::complex_float):
+        {
+            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                                    std::complex<float>, std::complex<float>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_double, library_data_t::complex_double,
+            library_data_t::complex_double, library_data_t::complex_double):
+        {
+            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                                    std::complex<double>, std::complex<double>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                                                a, lda, b, ldb, beta, c, ldc,
+                                                batch_size);
+            break;
+        }
+#ifdef __INTEL_MKL__
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                                    oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                                           b, ldb, beta, c, ldc, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_int32, library_data_t::real_int32):
+        {
+            float alpha_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+            float beta_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                                    float>(q, a_trans, b_trans, m, n, k, &alpha_float,
+                                           a, lda, b, ldb, &beta_float, c, ldc,
+                                           batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+#endif
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
+                batch_size);
+            break;
+        }
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
+        }
+#endif
+    }
+
+    /// Computes a batch of matrix-matrix product with general matrices.
+    /// \param [in] q The queue where the routine should be executed.
+    /// \param [in] a_trans Specifies the operation applied to A.
+    /// \param [in] b_trans Specifies the operation applied to B.
+    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+    /// \param [in] alpha Scaling factor for the matrix-matrix product.
+    /// \param [in] a Input matrix A.
+    /// \param [in] a_type Data type of the matrix A.
+    /// \param [in] lda Leading dimension of A.
+    /// \param [in] stride_a Stride between the different A matrices.
+    /// \param [in] b Input matrix B.
+    /// \param [in] b_type Data type of the matrix B.
+    /// \param [in] ldb Leading dimension of B.
+    /// \param [in] stride_b Stride between the different B matrices.
+    /// \param [in] beta Scaling factor for matrix C.
+    /// \param [in, out] c Input/Output matrix C.
+    /// \param [in] c_type Data type of the matrix C.
+    /// \param [in] ldc Leading dimension of C.
+    /// \param [in] stride_c Stride between the different C matrices.
+    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+    /// \param [in] scaling_type Data type of the scaling factors.
+    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                           oneapi::mkl::transpose b_trans, int m, int n, int k,
+                           const void *alpha, const void *a, library_data_t a_type,
+                           int lda, long long int stride_a, const void *b,
+                           library_data_t b_type, int ldb, long long int stride_b,
+                           const void *beta, void *c, library_data_t c_type,
+                           int ldc, long long int stride_c, int batch_size,
+                           library_data_t scaling_type)
+    {
+        bool matched = false;
+        if (scaling_type == library_data_t::real_float &&
+            c_type == library_data_t::complex_float)
+        {
+            scaling_type = library_data_t::complex_float;
+        }
+        else if (scaling_type == library_data_t::real_double &&
+                 c_type == library_data_t::complex_double)
+        {
+            scaling_type = library_data_t::complex_double;
+        }
+
+        std::uint64_t key =
+            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+        switch (key)
+        {
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<float, float, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+            library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_batch_impl<double, double, double, double>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_float, library_data_t::complex_float,
+            library_data_t::complex_float, library_data_t::complex_float):
+        {
+            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                                    std::complex<float>, std::complex<float>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_double, library_data_t::complex_double,
+            library_data_t::complex_double, library_data_t::complex_double):
+        {
+            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                                    std::complex<double>, std::complex<double>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                                                a, lda, stride_a, b, ldb, stride_b,
+                                                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+#ifdef __INTEL_MKL__
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                                    oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                                           stride_a, b, ldb, stride_b, beta, c, ldc,
+                                           stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_int32, library_data_t::real_int32):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                                    std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
+                                                  a, lda, stride_a, b, ldb, stride_b,
+                                                  beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+#endif
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
+                &beta_half, c, ldc, stride_c, batch_size);
+            break;
+        }
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
+        }
+    }
+
+    static inline void
+    async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
+                      size_t from_pitch, size_t x, size_t y,
+                      memcpy_direction direction = automatic,
+                      sycl::queue &q = get_default_queue())
+    {
+        detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
+                            direction);
+    }
+
+    using err0 = detail::generic_error_type<struct err0_tag, int>;
+    using err1 = detail::generic_error_type<struct err1_tag, int>;
+
+} // COPY from DPCT head files
+
+#endif
\ No newline at end of file

From ca2cb6982a37ffff5371132c8688ce8142f974fb Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Mon, 15 Jan 2024 13:42:24 +0800
Subject: [PATCH 22/90] update readme, refactor build script

---
 README_sycl.md            | 205 +++++++++++++++++++-------------------
 setup.sh => sycl_build.sh |  11 +-
 2 files changed, 109 insertions(+), 107 deletions(-)
 rename setup.sh => sycl_build.sh (62%)

diff --git a/README_sycl.md b/README_sycl.md
index 83b8a40ccc5ce..6ab0deb7f74da 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -8,179 +8,178 @@ oneAPI is a specification that is open and standards-based, supporting multiple
 
 Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
 
-This project is migrated the CUDA code to SYCL to support Intel CPU, GPU and FPGA.
-
-But we focus on GPU performance tuning. If you want to run llama.cpp on Intel CPU, please use llama.cpp CPU release.
 
 ## llama.cpp for SYCL
 
-We migrate the CUDA code SYCL. So the SYCL code replace the CUDA funcitions in llama.cpp, without function name change.
-
-That's why the code macro and log incudes CUBLAS flags.
-
-## OS
-
-### Linux
+To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
 
-In Linux, we reuse the CMAKE system of base. It's same as base llama.cpp.
+The llama.cpp for SYCL is used to support Intel GPUs.
 
-Except branch "windows", other branches are for Linux.
+For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
 
-### Windows
+## OS
 
-In Windows, we change the C source files to meet the requirement of C++ compilers.
+|OS|Status|Verified|
+|-|-|-|
+|Linux|Support|Ubuntu 22.04|
+|Windows|Ongoing| |
 
-So the code is saved in branch **windows** only.
 
-It will output 1 execute file: **llamap.cpp.sycl.exe**.
+## Intel GPU
 
-If you want to get more binary files, please change the build prject.
+|Intel GPU| Status | Verified Model|
+|-|-|-|
+|Intel Data Center Max Series| Support| Max 1550|
+|Intel Data Center Flex Series| Support| Flex 170|
+|Intel Arc Series| Support| Arc 770|
+|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
+|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
 
 
 ## Linux
 
 ### Setup Environment
 
-1. Install Intel oneAPI Base toolkit.
+1. Install Intel GPU driver.
 
-2. Build locally:
+a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
 
-```
-mkdir -p build
-cd build
-source /opt/intel/oneapi/setvars.sh
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-cmake --build . --config Release -v
-```
+Note: for iGPU, please install the client GPU driver.
 
-or
+b. Add user to group: video, render.
 
 ```
-./setup.sh
+sudo usermod -aG render username
+sudo usermod -aG video username
 ```
 
-### Run
+Note: re-login to enable it.
 
-#### Check device id
-
-Run without parameter:
+c. Check
 
 ```
-export GGML_SYCL_LIST_DEVICE=1
-./build/bin/main
+sudo apt install clinfo
+sudo clinfo -l
 ```
 
-Check the id in startup log, like:
+Output (example):
 
 ```
-SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3
-  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0
-  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0
-  Device 4: Intel(R) UHD Graphics 770, compute capability 3.0
-  Device 5: Intel(R) UHD Graphics 770, compute capability 1.3
-Exit for list devices task. unset GGML_SYCL_LIST_DEVICE to restore LLM work!
-```
+Platform #0: Intel(R) OpenCL Graphics
+ `-- Device #0: Intel(R) Arc(TM) A770 Graphics
 
-#### Put model file to folder **models**
 
-#### Modify run.sh
-
-Up run.sh as above info:
-```
-...
-GGML_SYCL_DEVICE=0
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
-```
-
-#### Run
+Platform #0: Intel(R) OpenCL HD Graphics
+ `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
 ```
-./run.sh
-```
-
-
-## Windows
 
-### Setup Environment
+2. Install Intel® oneAPI Base toolkit.
 
-1. Install MS Visual Studio 2022.
 
-2. Install Intel oneAPI Base toolkit.
+a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ] (https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
 
-a. Recommend to install all components and with **default path**.
+Recommend to install to default folder: **/opt/intel/oneapi**.
 
-b. During installation, please choose option to enable compiler in MS Visual Studio.
+Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
 
-3. Code
+b. Check
 
-Swith to branch **windows**.
+```
+source /opt/intel/oneapi/setvars.sh
 
-Open **llama.cpp.sycl.sln** by Visual Studio 2022.
+sycl-ls
+```
 
-4. Set oneAPI Path (optional)
+There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
 
-If you chagne the oneAPI installation target path, please modify the oneAPI path in the Visual Studio.
+Output (example):
+```
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
+[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
+[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
+[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
 
-Else, skip this step.
+```
 
-### Build
+2. Build locally:
 
-Build by visual Studio 2022 with x64 & Release.
+```
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
 
-There will be execute file: **llama.cpp.sycl.exe**.
+#for FP16
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
 
-It will take long time to build due to enable AOT on all hardware flatforms (CPU, GPU, FPGA) as default.
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
-To short it, change AOT target flatforms to one in Visual Studio 2022: **Specify SYCL offloading targets for AOT compilition**.
+#build example/main only
+#cmake --build . --config Release --target main
 
-#### Run
+#build all binary
+cmake --build . --config Release -v
 
-#### Enable oneAPI Environment
+```
 
-Run the command in command line or powershell.
+or
 
-'C:\Program Files (x86)\Intel\oneAPI\setvars.bat'
+```
+./sycl_build.sh
+```
 
-##### Check device id
+### Run
 
+1. List device ID
 
 Run without parameter:
 
 ```
-set GGML_SYCL_LIST_DEVICE=1
-.\x64\Release\llama.cpp.sycl.exe
+./build/bin/main
 ```
-Check the id in startup log, like:
+
+Check the ID in startup log, like:
 
 ```
-SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3
-  Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0
-  Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0
-  Device 4: Intel(R) UHD Graphics 770, compute capability 3.0
-  Device 5: Intel(R) UHD Graphics 770, compute capability 1.3
-Exit for list devices task. unset GGML_SYCL_LIST_DEVICE to restore LLM work!
+found 4 SYCL devices:
+  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
+	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
+	max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
+  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
+	max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
+  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
+	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+
 ```
 
-#### Put model file to folder **models**
+|Attribute|Note|
+|-|-|
+|compute capability 1.3|Level-zero running time, recommended |
+|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
+
+2. Put model file to folder **models**
 
-#### Modify run.sh
+3. Enable oneAPI running environment
 
-Up run.sh as above info:
 ```
-...
-set GGML_SYCL_DEVICE=0
+source /opt/intel/oneapi/setvars.sh
+```
+
+4. Set device ID and execute llama.cpp
+
+Set device ID = 0 by **GGML_SYCL_DEVICE=0**
 
-.\x64\Release\llama.cpp.sycl.exe -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33
 ```
+GGML_SYCL_DEVICE=0 && ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```
+5. Check the device ID in output
 
-#### Run
+Like：
 ```
-.\run.bat
+Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 ```
 
+
 ### Environment Variable
 
 #### Build
@@ -198,7 +197,5 @@ set GGML_SYCL_DEVICE=0
 
 |Name|Value|Function|
 |-|-|-|
-|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. List the device ids by **GGML_SYCL_LIST_DEVICE**|
-|GGML_SYCL_DEBUG|0 (default) or 1|Enable log funciton by macro: GGML_SYCL_DEBUG|
-|GGML_SYCL_LIST_DEVICE|0 (default) or 1|List the device ids only|
-
+|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
+|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
\ No newline at end of file
diff --git a/setup.sh b/sycl_build.sh
similarity index 62%
rename from setup.sh
rename to sycl_build.sh
index 0a071c984b270..8f2e1d22da08d 100755
--- a/setup.sh
+++ b/sycl_build.sh
@@ -2,8 +2,13 @@ mkdir -p build
 cd build
 source /opt/intel/oneapi/setvars.sh
 
-#cmake .. -DLLAMA_CLBLAST=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
+#for FP16
 #cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-cmake --build . --config Release --target main
+
+#build example/main only
+#cmake --build . --config Release --target main
+
+#build all binary
+cmake --build . --config Release -v

From 95daece908183a0a2ce4b784daed48e393e0e5ce Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Mon, 15 Jan 2024 14:18:19 +0800
Subject: [PATCH 23/90] fix build with sycl

---
 tests/test-sampling.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 32e58941c0ee0..e9dd6af1b7454 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -7,7 +7,11 @@
 
 #include <cmath>
 #include <numeric>
+
+#ifndef GGML_USE_SYCL
 #include <cassert>
+#endif
+
 #include <vector>
 #include <algorithm>
 

From a8936f49022d585a3857e9606fce0799926169bb Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Mon, 15 Jan 2024 14:33:52 +0800
Subject: [PATCH 24/90] set nthread=1 when sycl, increase performance

---
 ggml.h    | 1 +
 llama.cpp | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml.h b/ggml.h
index 533f40c9f8f16..5a173d362c79f 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2267,6 +2267,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_gpublas    (void);
     GGML_API int ggml_cpu_has_sse3       (void);
     GGML_API int ggml_cpu_has_ssse3      (void);
+    GGML_API int ggml_cpu_has_sycl       (void);
     GGML_API int ggml_cpu_has_vsx        (void);
 
     //
diff --git a/llama.cpp b/llama.cpp
index 61bafc9d208d6..eb9426f444a2c 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6703,7 +6703,7 @@ static int llama_decode_internal(
     }
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if (ggml_cpu_has_cublas() && fully_offloaded) {
+    if ((ggml_cpu_has_cublas() || ggml_cpu_has_sycl()) && fully_offloaded) {
         n_threads = 1;
     }
 

From 79d30d77132668c1d4d5213a8b09e4f385eb903c Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Mon, 15 Jan 2024 15:18:03 +0800
Subject: [PATCH 25/90] add run script, comment debug code

---
 README_sycl.md                        | 6 ++++++
 run.sh => examples/sycl_run_llama2.sh | 0
 ggml-sycl.cpp                         | 9 +++------
 3 files changed, 9 insertions(+), 6 deletions(-)
 rename run.sh => examples/sycl_run_llama2.sh (100%)

diff --git a/README_sycl.md b/README_sycl.md
index 6ab0deb7f74da..932b0e9293724 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -172,6 +172,12 @@ Set device ID = 0 by **GGML_SYCL_DEVICE=0**
 ```
 GGML_SYCL_DEVICE=0 && ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
 ```
+or run by script:
+
+```
+./examples/sycl_run_llama2.sh
+```
+
 5. Check the device ID in output
 
 Like：
diff --git a/run.sh b/examples/sycl_run_llama2.sh
similarity index 100%
rename from run.sh
rename to examples/sycl_run_llama2.sh
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index ba0c67d4398e8..2e6e686505232 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -10239,12 +10239,9 @@ static void ggml_sycl_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg
 static void ggml_sycl_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
     GGML_SYCL_DEBUG("call %s\n", __func__);
     ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rms_norm);
-    log_tensor_with_cnt("log_ggml_sycl_rms_norm_src0", (struct ggml_tensor *)src0, 6);
-    log_tensor_with_cnt("log_ggml_sycl_rms_norm_src1", (struct ggml_tensor *)src1, 6);
-    log_tensor_with_cnt("log_ggml_sycl_rms_norm_dst", dst, 6);
-
-    // int *ptr = NULL;
-    // *ptr = 0;
+    // log_tensor_with_cnt("log_ggml_sycl_rms_norm_src0", (struct ggml_tensor *)src0, 6);
+    // log_tensor_with_cnt("log_ggml_sycl_rms_norm_src1", (struct ggml_tensor *)src1, 6);
+    // log_tensor_with_cnt("log_ggml_sycl_rms_norm_dst", dst, 6);
 }
 
 bool ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {

From 0d6e7219b6a383ff97758988bf7a73b031ce852e Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Mon, 15 Jan 2024 15:59:43 +0800
Subject: [PATCH 26/90] add ls-sycl-device tool

---
 README_sycl.md                                |   22 +-
 examples/CMakeLists.txt                       |    3 +-
 examples/sycl/CMakeCache.txt                  |  369 +++++
 .../CMakeFiles/3.22.1/CMakeCCompiler.cmake    |   72 +
 .../CMakeFiles/3.22.1/CMakeCXXCompiler.cmake  |   83 ++
 .../sycl/CMakeFiles/3.22.1/CMakeSystem.cmake  |   15 +
 .../3.22.1/CompilerIdC/CMakeCCompilerId.c     |  803 +++++++++++
 .../sycl/CMakeFiles/3.22.1/CompilerIdC/a.out  |  Bin 0 -> 16256 bytes
 .../CompilerIdCXX/CMakeCXXCompilerId.cpp      |  791 +++++++++++
 .../CMakeFiles/3.22.1/CompilerIdCXX/a.out     |  Bin 0 -> 16264 bytes
 .../CMakeDirectoryInformation.cmake           |   16 +
 examples/sycl/CMakeFiles/Makefile.cmake       |  130 ++
 examples/sycl/CMakeFiles/Makefile2            | 1239 +++++++++++++++++
 examples/sycl/CMakeFiles/Progress/1           |    1 +
 examples/sycl/CMakeFiles/Progress/count.txt   |    1 +
 .../sycl/CMakeFiles/TargetDirectories.txt     |  183 +++
 examples/sycl/CMakeFiles/cmake.check_cache    |    1 +
 examples/sycl/CMakeFiles/progress.marks       |    1 +
 examples/sycl/CMakeLists.txt                  |    5 +
 examples/sycl/Makefile                        |  567 ++++++++
 examples/sycl/README.md                       |   47 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../baby-llama.dir/DependInfo.cmake           |   19 +
 .../CMakeFiles/baby-llama.dir/build.make      |  110 ++
 .../baby-llama.dir/cmake_clean.cmake          |   11 +
 .../baby-llama.dir/compiler_depend.make       |    2 +
 .../baby-llama.dir/compiler_depend.ts         |    2 +
 .../CMakeFiles/baby-llama.dir/depend.make     |    2 +
 .../CMakeFiles/baby-llama.dir/flags.make      |   10 +
 .../CMakeFiles/baby-llama.dir/link.txt        |    1 +
 .../CMakeFiles/baby-llama.dir/progress.make   |    3 +
 .../sycl/baby-llama/CMakeFiles/progress.marks |    1 +
 examples/sycl/baby-llama/Makefile             |  222 +++
 examples/sycl/baby-llama/cmake_install.cmake  |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../batched-bench.dir/DependInfo.cmake        |   19 +
 .../CMakeFiles/batched-bench.dir/build.make   |  110 ++
 .../batched-bench.dir/cmake_clean.cmake       |   11 +
 .../batched-bench.dir/compiler_depend.make    |    2 +
 .../batched-bench.dir/compiler_depend.ts      |    2 +
 .../CMakeFiles/batched-bench.dir/depend.make  |    2 +
 .../CMakeFiles/batched-bench.dir/flags.make   |   10 +
 .../CMakeFiles/batched-bench.dir/link.txt     |    1 +
 .../batched-bench.dir/progress.make           |    3 +
 .../batched-bench/CMakeFiles/progress.marks   |    1 +
 examples/sycl/batched-bench/Makefile          |  222 +++
 .../sycl/batched-bench/cmake_install.cmake    |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/batched.dir/DependInfo.cmake   |   19 +
 .../batched/CMakeFiles/batched.dir/build.make |  110 ++
 .../CMakeFiles/batched.dir/cmake_clean.cmake  |   11 +
 .../batched.dir/compiler_depend.make          |    2 +
 .../CMakeFiles/batched.dir/compiler_depend.ts |    2 +
 .../CMakeFiles/batched.dir/depend.make        |    2 +
 .../batched/CMakeFiles/batched.dir/flags.make |   10 +
 .../batched/CMakeFiles/batched.dir/link.txt   |    1 +
 .../CMakeFiles/batched.dir/progress.make      |    3 +
 .../sycl/batched/CMakeFiles/progress.marks    |    1 +
 examples/sycl/batched/Makefile                |  222 +++
 examples/sycl/batched/cmake_install.cmake     |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../beam-search.dir/DependInfo.cmake          |   19 +
 .../CMakeFiles/beam-search.dir/build.make     |  110 ++
 .../beam-search.dir/cmake_clean.cmake         |   11 +
 .../beam-search.dir/compiler_depend.make      |    2 +
 .../beam-search.dir/compiler_depend.ts        |    2 +
 .../CMakeFiles/beam-search.dir/depend.make    |    2 +
 .../CMakeFiles/beam-search.dir/flags.make     |   10 +
 .../CMakeFiles/beam-search.dir/link.txt       |    1 +
 .../CMakeFiles/beam-search.dir/progress.make  |    3 +
 .../beam-search/CMakeFiles/progress.marks     |    1 +
 examples/sycl/beam-search/Makefile            |  222 +++
 examples/sycl/beam-search/cmake_install.cmake |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/benchmark.dir/DependInfo.cmake |   19 +
 .../CMakeFiles/benchmark.dir/build.make       |  110 ++
 .../benchmark.dir/cmake_clean.cmake           |   11 +
 .../benchmark.dir/compiler_depend.make        |    2 +
 .../benchmark.dir/compiler_depend.ts          |    2 +
 .../CMakeFiles/benchmark.dir/depend.make      |    2 +
 .../CMakeFiles/benchmark.dir/flags.make       |   10 +
 .../CMakeFiles/benchmark.dir/link.txt         |    1 +
 .../CMakeFiles/benchmark.dir/progress.make    |    3 +
 .../sycl/benchmark/CMakeFiles/progress.marks  |    1 +
 examples/sycl/benchmark/Makefile              |  222 +++
 examples/sycl/benchmark/cmake_install.cmake   |   60 +
 examples/sycl/cmake_install.cmake             |   84 ++
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../DependInfo.cmake                          |   19 +
 .../convert-llama2c-to-ggml.dir/build.make    |  110 ++
 .../cmake_clean.cmake                         |   11 +
 .../compiler_depend.make                      |    2 +
 .../compiler_depend.ts                        |    2 +
 .../convert-llama2c-to-ggml.dir/depend.make   |    2 +
 .../convert-llama2c-to-ggml.dir/flags.make    |   10 +
 .../convert-llama2c-to-ggml.dir/link.txt      |    1 +
 .../convert-llama2c-to-ggml.dir/progress.make |    3 +
 .../CMakeFiles/progress.marks                 |    1 +
 .../sycl/convert-llama2c-to-ggml/Makefile     |  222 +++
 .../cmake_install.cmake                       |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/embedding.dir/DependInfo.cmake |   19 +
 .../CMakeFiles/embedding.dir/build.make       |  110 ++
 .../embedding.dir/cmake_clean.cmake           |   11 +
 .../embedding.dir/compiler_depend.make        |    2 +
 .../embedding.dir/compiler_depend.ts          |    2 +
 .../CMakeFiles/embedding.dir/depend.make      |    2 +
 .../CMakeFiles/embedding.dir/flags.make       |   10 +
 .../CMakeFiles/embedding.dir/link.txt         |    1 +
 .../CMakeFiles/embedding.dir/progress.make    |    3 +
 .../sycl/embedding/CMakeFiles/progress.marks  |    1 +
 examples/sycl/embedding/Makefile              |  222 +++
 examples/sycl/embedding/cmake_install.cmake   |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../export-lora.dir/DependInfo.cmake          |   19 +
 .../CMakeFiles/export-lora.dir/build.make     |  110 ++
 .../export-lora.dir/cmake_clean.cmake         |   11 +
 .../export-lora.dir/compiler_depend.make      |    2 +
 .../export-lora.dir/compiler_depend.ts        |    2 +
 .../CMakeFiles/export-lora.dir/depend.make    |    2 +
 .../CMakeFiles/export-lora.dir/flags.make     |   10 +
 .../CMakeFiles/export-lora.dir/link.txt       |    1 +
 .../CMakeFiles/export-lora.dir/progress.make  |    3 +
 .../export-lora/CMakeFiles/progress.marks     |    1 +
 examples/sycl/export-lora/Makefile            |  222 +++
 examples/sycl/export-lora/cmake_install.cmake |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/finetune.dir/DependInfo.cmake  |   19 +
 .../CMakeFiles/finetune.dir/build.make        |  110 ++
 .../CMakeFiles/finetune.dir/cmake_clean.cmake |   11 +
 .../finetune.dir/compiler_depend.make         |    2 +
 .../finetune.dir/compiler_depend.ts           |    2 +
 .../CMakeFiles/finetune.dir/depend.make       |    2 +
 .../CMakeFiles/finetune.dir/flags.make        |   10 +
 .../finetune/CMakeFiles/finetune.dir/link.txt |    1 +
 .../CMakeFiles/finetune.dir/progress.make     |    3 +
 .../sycl/finetune/CMakeFiles/progress.marks   |    1 +
 examples/sycl/finetune/Makefile               |  222 +++
 examples/sycl/finetune/cmake_install.cmake    |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/infill.dir/DependInfo.cmake    |   19 +
 .../infill/CMakeFiles/infill.dir/build.make   |  110 ++
 .../CMakeFiles/infill.dir/cmake_clean.cmake   |   11 +
 .../infill.dir/compiler_depend.make           |    2 +
 .../CMakeFiles/infill.dir/compiler_depend.ts  |    2 +
 .../infill/CMakeFiles/infill.dir/depend.make  |    2 +
 .../infill/CMakeFiles/infill.dir/flags.make   |   10 +
 .../infill/CMakeFiles/infill.dir/link.txt     |    1 +
 .../CMakeFiles/infill.dir/progress.make       |    3 +
 .../sycl/infill/CMakeFiles/progress.marks     |    1 +
 examples/sycl/infill/Makefile                 |  222 +++
 examples/sycl/infill/cmake_install.cmake      |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../llama-bench.dir/DependInfo.cmake          |   19 +
 .../CMakeFiles/llama-bench.dir/build.make     |  110 ++
 .../llama-bench.dir/cmake_clean.cmake         |   11 +
 .../llama-bench.dir/compiler_depend.make      |    2 +
 .../llama-bench.dir/compiler_depend.ts        |    2 +
 .../CMakeFiles/llama-bench.dir/depend.make    |    2 +
 .../CMakeFiles/llama-bench.dir/flags.make     |   10 +
 .../CMakeFiles/llama-bench.dir/link.txt       |    1 +
 .../CMakeFiles/llama-bench.dir/progress.make  |    3 +
 .../llama-bench/CMakeFiles/progress.marks     |    1 +
 examples/sycl/llama-bench/Makefile            |  222 +++
 examples/sycl/llama-bench/cmake_install.cmake |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/llava-cli.dir/DependInfo.cmake |   19 +
 .../llava/CMakeFiles/llava-cli.dir/build.make |  114 ++
 .../llava-cli.dir/cmake_clean.cmake           |   11 +
 .../llava-cli.dir/compiler_depend.make        |    2 +
 .../llava-cli.dir/compiler_depend.ts          |    2 +
 .../CMakeFiles/llava-cli.dir/depend.make      |    2 +
 .../llava/CMakeFiles/llava-cli.dir/flags.make |   10 +
 .../llava/CMakeFiles/llava-cli.dir/link.txt   |    1 +
 .../CMakeFiles/llava-cli.dir/progress.make    |    3 +
 .../CMakeFiles/llava.dir/DependInfo.cmake     |   20 +
 .../llava/CMakeFiles/llava.dir/build.make     |  116 ++
 .../CMakeFiles/llava.dir/cmake_clean.cmake    |   11 +
 .../CMakeFiles/llava.dir/compiler_depend.make |    2 +
 .../CMakeFiles/llava.dir/compiler_depend.ts   |    2 +
 .../llava/CMakeFiles/llava.dir/depend.make    |    2 +
 .../llava/CMakeFiles/llava.dir/flags.make     |   10 +
 .../llava/CMakeFiles/llava.dir/progress.make  |    3 +
 .../llava_static.dir/DependInfo.cmake         |   18 +
 .../CMakeFiles/llava_static.dir/build.make    |   99 ++
 .../llava_static.dir/cmake_clean.cmake        |    9 +
 .../llava_static.dir/cmake_clean_target.cmake |    3 +
 .../llava_static.dir/compiler_depend.make     |    2 +
 .../llava_static.dir/compiler_depend.ts       |    2 +
 .../CMakeFiles/llava_static.dir/depend.make   |    2 +
 .../CMakeFiles/llava_static.dir/flags.make    |   10 +
 .../CMakeFiles/llava_static.dir/link.txt      |    2 +
 .../CMakeFiles/llava_static.dir/progress.make |    2 +
 examples/sycl/llava/CMakeFiles/progress.marks |    1 +
 examples/sycl/llava/Makefile                  |  288 ++++
 examples/sycl/llava/cmake_install.cmake       |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/lookahead.dir/DependInfo.cmake |   19 +
 .../CMakeFiles/lookahead.dir/build.make       |  110 ++
 .../lookahead.dir/cmake_clean.cmake           |   11 +
 .../lookahead.dir/compiler_depend.make        |    2 +
 .../lookahead.dir/compiler_depend.ts          |    2 +
 .../CMakeFiles/lookahead.dir/depend.make      |    2 +
 .../CMakeFiles/lookahead.dir/flags.make       |   10 +
 .../CMakeFiles/lookahead.dir/link.txt         |    1 +
 .../CMakeFiles/lookahead.dir/progress.make    |    3 +
 .../sycl/lookahead/CMakeFiles/progress.marks  |    1 +
 examples/sycl/lookahead/Makefile              |  222 +++
 examples/sycl/lookahead/cmake_install.cmake   |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/lookup.dir/DependInfo.cmake    |   19 +
 .../lookup/CMakeFiles/lookup.dir/build.make   |  110 ++
 .../CMakeFiles/lookup.dir/cmake_clean.cmake   |   11 +
 .../lookup.dir/compiler_depend.make           |    2 +
 .../CMakeFiles/lookup.dir/compiler_depend.ts  |    2 +
 .../lookup/CMakeFiles/lookup.dir/depend.make  |    2 +
 .../lookup/CMakeFiles/lookup.dir/flags.make   |   10 +
 .../lookup/CMakeFiles/lookup.dir/link.txt     |    1 +
 .../CMakeFiles/lookup.dir/progress.make       |    3 +
 .../sycl/lookup/CMakeFiles/progress.marks     |    1 +
 examples/sycl/lookup/Makefile                 |  222 +++
 examples/sycl/lookup/cmake_install.cmake      |   60 +
 examples/sycl/ls-sycl-device.cpp              |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../main/CMakeFiles/main.dir/DependInfo.cmake |   19 +
 .../sycl/main/CMakeFiles/main.dir/build.make  |  110 ++
 .../CMakeFiles/main.dir/cmake_clean.cmake     |   11 +
 .../CMakeFiles/main.dir/compiler_depend.make  |    2 +
 .../CMakeFiles/main.dir/compiler_depend.ts    |    2 +
 .../sycl/main/CMakeFiles/main.dir/depend.make |    2 +
 .../sycl/main/CMakeFiles/main.dir/flags.make  |   10 +
 .../sycl/main/CMakeFiles/main.dir/link.txt    |    1 +
 .../main/CMakeFiles/main.dir/progress.make    |    3 +
 examples/sycl/main/CMakeFiles/progress.marks  |    1 +
 examples/sycl/main/Makefile                   |  222 +++
 examples/sycl/main/cmake_install.cmake        |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/parallel.dir/DependInfo.cmake  |   19 +
 .../CMakeFiles/parallel.dir/build.make        |  110 ++
 .../CMakeFiles/parallel.dir/cmake_clean.cmake |   11 +
 .../parallel.dir/compiler_depend.make         |    2 +
 .../parallel.dir/compiler_depend.ts           |    2 +
 .../CMakeFiles/parallel.dir/depend.make       |    2 +
 .../CMakeFiles/parallel.dir/flags.make        |   10 +
 .../parallel/CMakeFiles/parallel.dir/link.txt |    1 +
 .../CMakeFiles/parallel.dir/progress.make     |    3 +
 .../sycl/parallel/CMakeFiles/progress.marks   |    1 +
 examples/sycl/parallel/Makefile               |  222 +++
 examples/sycl/parallel/cmake_install.cmake    |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../perplexity.dir/DependInfo.cmake           |   19 +
 .../CMakeFiles/perplexity.dir/build.make      |  110 ++
 .../perplexity.dir/cmake_clean.cmake          |   11 +
 .../perplexity.dir/compiler_depend.make       |    2 +
 .../perplexity.dir/compiler_depend.ts         |    2 +
 .../CMakeFiles/perplexity.dir/depend.make     |    2 +
 .../CMakeFiles/perplexity.dir/flags.make      |   10 +
 .../CMakeFiles/perplexity.dir/link.txt        |    1 +
 .../CMakeFiles/perplexity.dir/progress.make   |    3 +
 .../sycl/perplexity/CMakeFiles/progress.marks |    1 +
 examples/sycl/perplexity/Makefile             |  222 +++
 examples/sycl/perplexity/cmake_install.cmake  |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../quantize-stats/CMakeFiles/progress.marks  |    1 +
 .../quantize-stats.dir/DependInfo.cmake       |   19 +
 .../CMakeFiles/quantize-stats.dir/build.make  |  110 ++
 .../quantize-stats.dir/cmake_clean.cmake      |   11 +
 .../quantize-stats.dir/compiler_depend.make   |    2 +
 .../quantize-stats.dir/compiler_depend.ts     |    2 +
 .../CMakeFiles/quantize-stats.dir/depend.make |    2 +
 .../CMakeFiles/quantize-stats.dir/flags.make  |   10 +
 .../CMakeFiles/quantize-stats.dir/link.txt    |    1 +
 .../quantize-stats.dir/progress.make          |    3 +
 examples/sycl/quantize-stats/Makefile         |  222 +++
 .../sycl/quantize-stats/cmake_install.cmake   |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../sycl/quantize/CMakeFiles/progress.marks   |    1 +
 .../CMakeFiles/quantize.dir/DependInfo.cmake  |   19 +
 .../CMakeFiles/quantize.dir/build.make        |  110 ++
 .../CMakeFiles/quantize.dir/cmake_clean.cmake |   11 +
 .../quantize.dir/compiler_depend.make         |    2 +
 .../quantize.dir/compiler_depend.ts           |    2 +
 .../CMakeFiles/quantize.dir/depend.make       |    2 +
 .../CMakeFiles/quantize.dir/flags.make        |   10 +
 .../quantize/CMakeFiles/quantize.dir/link.txt |    1 +
 .../CMakeFiles/quantize.dir/progress.make     |    3 +
 examples/sycl/quantize/Makefile               |  222 +++
 examples/sycl/quantize/cmake_install.cmake    |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../save-load-state/CMakeFiles/progress.marks |    1 +
 .../save-load-state.dir/DependInfo.cmake      |   19 +
 .../CMakeFiles/save-load-state.dir/build.make |  110 ++
 .../save-load-state.dir/cmake_clean.cmake     |   11 +
 .../save-load-state.dir/compiler_depend.make  |    2 +
 .../save-load-state.dir/compiler_depend.ts    |    2 +
 .../save-load-state.dir/depend.make           |    2 +
 .../CMakeFiles/save-load-state.dir/flags.make |   10 +
 .../CMakeFiles/save-load-state.dir/link.txt   |    1 +
 .../save-load-state.dir/progress.make         |    3 +
 examples/sycl/save-load-state/Makefile        |  222 +++
 .../sycl/save-load-state/cmake_install.cmake  |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../sycl/simple/CMakeFiles/progress.marks     |    1 +
 .../CMakeFiles/simple.dir/DependInfo.cmake    |   19 +
 .../simple/CMakeFiles/simple.dir/build.make   |  110 ++
 .../CMakeFiles/simple.dir/cmake_clean.cmake   |   11 +
 .../simple.dir/compiler_depend.make           |    2 +
 .../CMakeFiles/simple.dir/compiler_depend.ts  |    2 +
 .../simple/CMakeFiles/simple.dir/depend.make  |    2 +
 .../simple/CMakeFiles/simple.dir/flags.make   |   10 +
 .../simple/CMakeFiles/simple.dir/link.txt     |    1 +
 .../CMakeFiles/simple.dir/progress.make       |    3 +
 examples/sycl/simple/Makefile                 |  222 +++
 examples/sycl/simple/cmake_install.cmake      |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../speculative/CMakeFiles/progress.marks     |    1 +
 .../speculative.dir/DependInfo.cmake          |   19 +
 .../CMakeFiles/speculative.dir/build.make     |  110 ++
 .../speculative.dir/cmake_clean.cmake         |   11 +
 .../speculative.dir/compiler_depend.make      |    2 +
 .../speculative.dir/compiler_depend.ts        |    2 +
 .../CMakeFiles/speculative.dir/depend.make    |    2 +
 .../CMakeFiles/speculative.dir/flags.make     |   10 +
 .../CMakeFiles/speculative.dir/link.txt       |    1 +
 .../CMakeFiles/speculative.dir/progress.make  |    3 +
 examples/sycl/speculative/Makefile            |  222 +++
 examples/sycl/speculative/cmake_install.cmake |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../ls-sycl-device.dir/DependInfo.cmake       |   19 +
 .../CMakeFiles/ls-sycl-device.dir/build.make  |  110 ++
 .../ls-sycl-device.dir/cmake_clean.cmake      |   11 +
 .../ls-sycl-device.dir/compiler_depend.make   |    2 +
 .../ls-sycl-device.dir/compiler_depend.ts     |    2 +
 .../CMakeFiles/ls-sycl-device.dir/depend.make |    2 +
 .../CMakeFiles/ls-sycl-device.dir/flags.make  |   10 +
 .../CMakeFiles/ls-sycl-device.dir/link.txt    |    1 +
 .../ls-sycl-device.dir/progress.make          |    3 +
 examples/sycl/sycl/CMakeFiles/progress.marks  |    1 +
 examples/sycl/sycl/Makefile                   |  222 +++
 examples/sycl/sycl/cmake_install.cmake        |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../sycl/tokenize/CMakeFiles/progress.marks   |    1 +
 .../CMakeFiles/tokenize.dir/DependInfo.cmake  |   19 +
 .../CMakeFiles/tokenize.dir/build.make        |  110 ++
 .../CMakeFiles/tokenize.dir/cmake_clean.cmake |   11 +
 .../tokenize.dir/compiler_depend.make         |    2 +
 .../tokenize.dir/compiler_depend.ts           |    2 +
 .../CMakeFiles/tokenize.dir/depend.make       |    2 +
 .../CMakeFiles/tokenize.dir/flags.make        |   10 +
 .../tokenize/CMakeFiles/tokenize.dir/link.txt |    1 +
 .../CMakeFiles/tokenize.dir/progress.make     |    3 +
 examples/sycl/tokenize/Makefile               |  222 +++
 examples/sycl/tokenize/cmake_install.cmake    |   60 +
 .../CMakeDirectoryInformation.cmake           |   16 +
 .../CMakeFiles/progress.marks                 |    1 +
 .../DependInfo.cmake                          |   19 +
 .../train-text-from-scratch.dir/build.make    |  110 ++
 .../cmake_clean.cmake                         |   11 +
 .../compiler_depend.make                      |    2 +
 .../compiler_depend.ts                        |    2 +
 .../train-text-from-scratch.dir/depend.make   |    2 +
 .../train-text-from-scratch.dir/flags.make    |   10 +
 .../train-text-from-scratch.dir/link.txt      |    1 +
 .../train-text-from-scratch.dir/progress.make |    3 +
 .../sycl/train-text-from-scratch/Makefile     |  222 +++
 .../cmake_install.cmake                       |   60 +
 366 files changed, 16343 insertions(+), 10 deletions(-)
 create mode 100644 examples/sycl/CMakeCache.txt
 create mode 100644 examples/sycl/CMakeFiles/3.22.1/CMakeCCompiler.cmake
 create mode 100644 examples/sycl/CMakeFiles/3.22.1/CMakeCXXCompiler.cmake
 create mode 100644 examples/sycl/CMakeFiles/3.22.1/CMakeSystem.cmake
 create mode 100644 examples/sycl/CMakeFiles/3.22.1/CompilerIdC/CMakeCCompilerId.c
 create mode 100755 examples/sycl/CMakeFiles/3.22.1/CompilerIdC/a.out
 create mode 100644 examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/CMakeCXXCompilerId.cpp
 create mode 100755 examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/a.out
 create mode 100644 examples/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/CMakeFiles/Makefile.cmake
 create mode 100644 examples/sycl/CMakeFiles/Makefile2
 create mode 100644 examples/sycl/CMakeFiles/Progress/1
 create mode 100644 examples/sycl/CMakeFiles/Progress/count.txt
 create mode 100644 examples/sycl/CMakeFiles/TargetDirectories.txt
 create mode 100644 examples/sycl/CMakeFiles/cmake.check_cache
 create mode 100644 examples/sycl/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/CMakeLists.txt
 create mode 100644 examples/sycl/Makefile
 create mode 100644 examples/sycl/README.md
 create mode 100644 examples/sycl/baby-llama/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake
 create mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/build.make
 create mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.make
 create mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.ts
 create mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/depend.make
 create mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/flags.make
 create mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/link.txt
 create mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/progress.make
 create mode 100644 examples/sycl/baby-llama/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/baby-llama/Makefile
 create mode 100644 examples/sycl/baby-llama/cmake_install.cmake
 create mode 100644 examples/sycl/batched-bench/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake
 create mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/build.make
 create mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.make
 create mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.ts
 create mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/depend.make
 create mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/flags.make
 create mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/link.txt
 create mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/progress.make
 create mode 100644 examples/sycl/batched-bench/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/batched-bench/Makefile
 create mode 100644 examples/sycl/batched-bench/cmake_install.cmake
 create mode 100644 examples/sycl/batched/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/DependInfo.cmake
 create mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/build.make
 create mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.make
 create mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.ts
 create mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/depend.make
 create mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/flags.make
 create mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/link.txt
 create mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/progress.make
 create mode 100644 examples/sycl/batched/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/batched/Makefile
 create mode 100644 examples/sycl/batched/cmake_install.cmake
 create mode 100644 examples/sycl/beam-search/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake
 create mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/build.make
 create mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.make
 create mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.ts
 create mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/depend.make
 create mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/flags.make
 create mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/link.txt
 create mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/progress.make
 create mode 100644 examples/sycl/beam-search/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/beam-search/Makefile
 create mode 100644 examples/sycl/beam-search/cmake_install.cmake
 create mode 100644 examples/sycl/benchmark/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake
 create mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/build.make
 create mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.make
 create mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.ts
 create mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/depend.make
 create mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/flags.make
 create mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/link.txt
 create mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/progress.make
 create mode 100644 examples/sycl/benchmark/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/benchmark/Makefile
 create mode 100644 examples/sycl/benchmark/cmake_install.cmake
 create mode 100644 examples/sycl/cmake_install.cmake
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.make
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.ts
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend.make
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/link.txt
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/progress.make
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/Makefile
 create mode 100644 examples/sycl/convert-llama2c-to-ggml/cmake_install.cmake
 create mode 100644 examples/sycl/embedding/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/DependInfo.cmake
 create mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/build.make
 create mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.make
 create mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.ts
 create mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/depend.make
 create mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/flags.make
 create mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/link.txt
 create mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/progress.make
 create mode 100644 examples/sycl/embedding/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/embedding/Makefile
 create mode 100644 examples/sycl/embedding/cmake_install.cmake
 create mode 100644 examples/sycl/export-lora/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake
 create mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/build.make
 create mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.make
 create mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.ts
 create mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/depend.make
 create mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/flags.make
 create mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/link.txt
 create mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/progress.make
 create mode 100644 examples/sycl/export-lora/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/export-lora/Makefile
 create mode 100644 examples/sycl/export-lora/cmake_install.cmake
 create mode 100644 examples/sycl/finetune/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/DependInfo.cmake
 create mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/build.make
 create mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.make
 create mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.ts
 create mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/depend.make
 create mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/flags.make
 create mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/link.txt
 create mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/progress.make
 create mode 100644 examples/sycl/finetune/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/finetune/Makefile
 create mode 100644 examples/sycl/finetune/cmake_install.cmake
 create mode 100644 examples/sycl/infill/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/DependInfo.cmake
 create mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/build.make
 create mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.make
 create mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.ts
 create mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/depend.make
 create mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/flags.make
 create mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/link.txt
 create mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/progress.make
 create mode 100644 examples/sycl/infill/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/infill/Makefile
 create mode 100644 examples/sycl/infill/cmake_install.cmake
 create mode 100644 examples/sycl/llama-bench/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake
 create mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/build.make
 create mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.make
 create mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.ts
 create mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/depend.make
 create mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/flags.make
 create mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/link.txt
 create mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/progress.make
 create mode 100644 examples/sycl/llama-bench/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/llama-bench/Makefile
 create mode 100644 examples/sycl/llama-bench/cmake_install.cmake
 create mode 100644 examples/sycl/llava/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/DependInfo.cmake
 create mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/build.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.ts
 create mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/depend.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/flags.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/link.txt
 create mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/progress.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/DependInfo.cmake
 create mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/build.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.ts
 create mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/depend.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/flags.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/progress.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/DependInfo.cmake
 create mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/build.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean_target.cmake
 create mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.ts
 create mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/depend.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/flags.make
 create mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/link.txt
 create mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/progress.make
 create mode 100644 examples/sycl/llava/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/llava/Makefile
 create mode 100644 examples/sycl/llava/cmake_install.cmake
 create mode 100644 examples/sycl/lookahead/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake
 create mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/build.make
 create mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.make
 create mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.ts
 create mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/depend.make
 create mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/flags.make
 create mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/link.txt
 create mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/progress.make
 create mode 100644 examples/sycl/lookahead/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/lookahead/Makefile
 create mode 100644 examples/sycl/lookahead/cmake_install.cmake
 create mode 100644 examples/sycl/lookup/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/DependInfo.cmake
 create mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/build.make
 create mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.make
 create mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.ts
 create mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/depend.make
 create mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/flags.make
 create mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/link.txt
 create mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/progress.make
 create mode 100644 examples/sycl/lookup/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/lookup/Makefile
 create mode 100644 examples/sycl/lookup/cmake_install.cmake
 create mode 100644 examples/sycl/ls-sycl-device.cpp
 create mode 100644 examples/sycl/main/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/main/CMakeFiles/main.dir/DependInfo.cmake
 create mode 100644 examples/sycl/main/CMakeFiles/main.dir/build.make
 create mode 100644 examples/sycl/main/CMakeFiles/main.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/main/CMakeFiles/main.dir/compiler_depend.make
 create mode 100644 examples/sycl/main/CMakeFiles/main.dir/compiler_depend.ts
 create mode 100644 examples/sycl/main/CMakeFiles/main.dir/depend.make
 create mode 100644 examples/sycl/main/CMakeFiles/main.dir/flags.make
 create mode 100644 examples/sycl/main/CMakeFiles/main.dir/link.txt
 create mode 100644 examples/sycl/main/CMakeFiles/main.dir/progress.make
 create mode 100644 examples/sycl/main/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/main/Makefile
 create mode 100644 examples/sycl/main/cmake_install.cmake
 create mode 100644 examples/sycl/parallel/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/DependInfo.cmake
 create mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/build.make
 create mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.make
 create mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.ts
 create mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/depend.make
 create mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/flags.make
 create mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/link.txt
 create mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/progress.make
 create mode 100644 examples/sycl/parallel/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/parallel/Makefile
 create mode 100644 examples/sycl/parallel/cmake_install.cmake
 create mode 100644 examples/sycl/perplexity/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake
 create mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/build.make
 create mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.make
 create mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.ts
 create mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/depend.make
 create mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/flags.make
 create mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/link.txt
 create mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/progress.make
 create mode 100644 examples/sycl/perplexity/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/perplexity/Makefile
 create mode 100644 examples/sycl/perplexity/cmake_install.cmake
 create mode 100644 examples/sycl/quantize-stats/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/quantize-stats/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake
 create mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/build.make
 create mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.make
 create mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.ts
 create mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/depend.make
 create mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/flags.make
 create mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/link.txt
 create mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/progress.make
 create mode 100644 examples/sycl/quantize-stats/Makefile
 create mode 100644 examples/sycl/quantize-stats/cmake_install.cmake
 create mode 100644 examples/sycl/quantize/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/quantize/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/DependInfo.cmake
 create mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/build.make
 create mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.make
 create mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.ts
 create mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/depend.make
 create mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/flags.make
 create mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/link.txt
 create mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/progress.make
 create mode 100644 examples/sycl/quantize/Makefile
 create mode 100644 examples/sycl/quantize/cmake_install.cmake
 create mode 100644 examples/sycl/save-load-state/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/save-load-state/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake
 create mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/build.make
 create mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.make
 create mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.ts
 create mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/depend.make
 create mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/flags.make
 create mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/link.txt
 create mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/progress.make
 create mode 100644 examples/sycl/save-load-state/Makefile
 create mode 100644 examples/sycl/save-load-state/cmake_install.cmake
 create mode 100644 examples/sycl/simple/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/simple/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/DependInfo.cmake
 create mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/build.make
 create mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.make
 create mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.ts
 create mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/depend.make
 create mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/flags.make
 create mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/link.txt
 create mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/progress.make
 create mode 100644 examples/sycl/simple/Makefile
 create mode 100644 examples/sycl/simple/cmake_install.cmake
 create mode 100644 examples/sycl/speculative/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/speculative/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/DependInfo.cmake
 create mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/build.make
 create mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.make
 create mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.ts
 create mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/depend.make
 create mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/flags.make
 create mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/link.txt
 create mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/progress.make
 create mode 100644 examples/sycl/speculative/Makefile
 create mode 100644 examples/sycl/speculative/cmake_install.cmake
 create mode 100644 examples/sycl/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake
 create mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/build.make
 create mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.make
 create mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.ts
 create mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/depend.make
 create mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/flags.make
 create mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/link.txt
 create mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/progress.make
 create mode 100644 examples/sycl/sycl/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/sycl/Makefile
 create mode 100644 examples/sycl/sycl/cmake_install.cmake
 create mode 100644 examples/sycl/tokenize/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/tokenize/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake
 create mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/build.make
 create mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.make
 create mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.ts
 create mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/depend.make
 create mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/flags.make
 create mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/link.txt
 create mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/progress.make
 create mode 100644 examples/sycl/tokenize/Makefile
 create mode 100644 examples/sycl/tokenize/cmake_install.cmake
 create mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/CMakeDirectoryInformation.cmake
 create mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/progress.marks
 create mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake
 create mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make
 create mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/cmake_clean.cmake
 create mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.make
 create mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.ts
 create mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend.make
 create mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make
 create mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/link.txt
 create mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/progress.make
 create mode 100644 examples/sycl/train-text-from-scratch/Makefile
 create mode 100644 examples/sycl/train-text-from-scratch/cmake_install.cmake

diff --git a/README_sycl.md b/README_sycl.md
index 932b0e9293724..ddd7de3fcf3f9 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -129,11 +129,23 @@ or
 
 ### Run
 
-1. List device ID
+1. Put model file to folder **models**
+
+2. Enable oneAPI running environment
+
+```
+source /opt/intel/oneapi/setvars.sh
+```
+
+3. List device ID
 
 Run without parameter:
 
 ```
+./build/bin/ls-sycl-device
+
+or
+
 ./build/bin/main
 ```
 
@@ -157,14 +169,6 @@ found 4 SYCL devices:
 |compute capability 1.3|Level-zero running time, recommended |
 |compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
 
-2. Put model file to folder **models**
-
-3. Enable oneAPI running environment
-
-```
-source /opt/intel/oneapi/setvars.sh
-```
-
 4. Set device ID and execute llama.cpp
 
 Set device ID = 0 by **GGML_SYCL_DEVICE=0**
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index f67d74c5530c9..e9c738985fd4a 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -23,6 +23,7 @@ else()
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
+    add_subdirectory(sycl)
     add_subdirectory(main)
     add_subdirectory(tokenize)
     add_subdirectory(parallel)
@@ -34,7 +35,7 @@ else()
     add_subdirectory(passkey)
     add_subdirectory(speculative)
     add_subdirectory(lookahead)
-    add_subdirectory(lookup)
+    add_subdirectory(lookup)    
     add_subdirectory(train-text-from-scratch)
     add_subdirectory(imatrix)
     if (LLAMA_BUILD_SERVER)
diff --git a/examples/sycl/CMakeCache.txt b/examples/sycl/CMakeCache.txt
new file mode 100644
index 0000000000000..c8998b4ac8f15
--- /dev/null
+++ b/examples/sycl/CMakeCache.txt
@@ -0,0 +1,369 @@
+# This is the CMakeCache file.
+# For build in directory: /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+# It was generated by CMake: /usr/bin/cmake
+# You can edit this file to change values found and used by cmake.
+# If you do not want to change any of the values, simply exit the editor.
+# If you do want to change a value, simply edit, save, and exit the editor.
+# The syntax for the file is as follows:
+# KEY:TYPE=VALUE
+# KEY is the name of a variable in the cache.
+# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
+# VALUE is the current value for the KEY.
+
+########################
+# EXTERNAL cache entries
+########################
+
+//Path to a program.
+CMAKE_ADDR2LINE:FILEPATH=/usr/bin/addr2line
+
+//Path to a program.
+CMAKE_AR:FILEPATH=/usr/bin/ar
+
+//For backwards compatibility, what version of CMake commands and
+// syntax should this version of CMake try to support.
+CMAKE_BACKWARDS_COMPATIBILITY:STRING=2.4
+
+//Choose the type of build, options are: None Debug Release RelWithDebInfo
+// MinSizeRel ...
+CMAKE_BUILD_TYPE:STRING=
+
+//Enable/Disable color output during build.
+CMAKE_COLOR_MAKEFILE:BOOL=ON
+
+//CXX compiler
+CMAKE_CXX_COMPILER:STRING=/opt/intel/oneapi/compiler/2024.0/bin/icpx
+
+//Flags used by the CXX compiler during all build types.
+CMAKE_CXX_FLAGS:STRING=
+
+//Flags used by the CXX compiler during DEBUG builds.
+CMAKE_CXX_FLAGS_DEBUG:STRING=-g
+
+//Flags used by the CXX compiler during MINSIZEREL builds.
+CMAKE_CXX_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
+
+//Flags used by the CXX compiler during RELEASE builds.
+CMAKE_CXX_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
+
+//Flags used by the CXX compiler during RELWITHDEBINFO builds.
+CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
+
+//C compiler
+CMAKE_C_COMPILER:STRING=/opt/intel/oneapi/compiler/2024.0/bin/icx
+
+//Flags used by the C compiler during all build types.
+CMAKE_C_FLAGS:STRING=
+
+//Flags used by the C compiler during DEBUG builds.
+CMAKE_C_FLAGS_DEBUG:STRING=-g
+
+//Flags used by the C compiler during MINSIZEREL builds.
+CMAKE_C_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
+
+//Flags used by the C compiler during RELEASE builds.
+CMAKE_C_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
+
+//Flags used by the C compiler during RELWITHDEBINFO builds.
+CMAKE_C_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
+
+//Path to a program.
+CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND
+
+//Flags used by the linker during all build types.
+CMAKE_EXE_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during DEBUG builds.
+CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during MINSIZEREL builds.
+CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during RELEASE builds.
+CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during RELWITHDEBINFO builds.
+CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Enable/Disable output of compile commands during generation.
+CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=
+
+//Install path prefix, prepended onto install directories.
+CMAKE_INSTALL_PREFIX:PATH=/usr/local
+
+//Path to a program.
+CMAKE_LINKER:FILEPATH=/usr/bin/ld
+
+//Path to a program.
+CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/gmake
+
+//Flags used by the linker during the creation of modules during
+// all build types.
+CMAKE_MODULE_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of modules during
+// DEBUG builds.
+CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of modules during
+// MINSIZEREL builds.
+CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of modules during
+// RELEASE builds.
+CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of modules during
+// RELWITHDEBINFO builds.
+CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Path to a program.
+CMAKE_NM:FILEPATH=/usr/bin/nm
+
+//Path to a program.
+CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
+
+//Path to a program.
+CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
+
+//Value Computed by CMake
+CMAKE_PROJECT_DESCRIPTION:STATIC=
+
+//Value Computed by CMake
+CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
+
+//Value Computed by CMake
+CMAKE_PROJECT_NAME:STATIC=Project
+
+//Path to a program.
+CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
+
+//Path to a program.
+CMAKE_READELF:FILEPATH=/usr/bin/readelf
+
+//Flags used by the linker during the creation of shared libraries
+// during all build types.
+CMAKE_SHARED_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during DEBUG builds.
+CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during MINSIZEREL builds.
+CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during RELEASE builds.
+CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of shared libraries
+// during RELWITHDEBINFO builds.
+CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//If set, runtime paths are not added when installing shared libraries,
+// but are added when building.
+CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
+
+//If set, runtime paths are not added when using shared libraries.
+CMAKE_SKIP_RPATH:BOOL=NO
+
+//Flags used by the linker during the creation of static libraries
+// during all build types.
+CMAKE_STATIC_LINKER_FLAGS:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during DEBUG builds.
+CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during MINSIZEREL builds.
+CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during RELEASE builds.
+CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
+
+//Flags used by the linker during the creation of static libraries
+// during RELWITHDEBINFO builds.
+CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
+
+//Path to a program.
+CMAKE_STRIP:FILEPATH=/usr/bin/strip
+
+//If this value is on, makefiles will be generated without the
+// .SILENT directive, and all commands will be echoed to the console
+// during the make.  This is useful for debugging only. With Visual
+// Studio IDE projects all commands are done without /nologo.
+CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
+
+//Single output directory for building all executables.
+EXECUTABLE_OUTPUT_PATH:PATH=
+
+//Single output directory for building all libraries.
+LIBRARY_OUTPUT_PATH:PATH=
+
+//No help, variable specified on the command line.
+LLAMA_SYCL:UNINITIALIZED=ON
+
+//Value Computed by CMake
+Project_BINARY_DIR:STATIC=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+//Value Computed by CMake
+Project_IS_TOP_LEVEL:STATIC=ON
+
+//Value Computed by CMake
+Project_SOURCE_DIR:STATIC=/home/jianyuzh/ws/llama.cpp/develop/examples
+
+//Path to a program.
+XIAR:FILEPATH=/opt/intel/oneapi/compiler/2024.0/bin/xiar
+
+
+########################
+# INTERNAL cache entries
+########################
+
+//ADVANCED property for variable: CMAKE_ADDR2LINE
+CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_AR
+CMAKE_AR-ADVANCED:INTERNAL=1
+//This is the directory where this CMakeCache.txt was created
+CMAKE_CACHEFILE_DIR:INTERNAL=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+//Major version of cmake used to create the current loaded cache
+CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
+//Minor version of cmake used to create the current loaded cache
+CMAKE_CACHE_MINOR_VERSION:INTERNAL=22
+//Patch version of cmake used to create the current loaded cache
+CMAKE_CACHE_PATCH_VERSION:INTERNAL=1
+//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
+CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
+//Path to CMake executable.
+CMAKE_COMMAND:INTERNAL=/usr/bin/cmake
+//Path to cpack program executable.
+CMAKE_CPACK_COMMAND:INTERNAL=/usr/bin/cpack
+//Path to ctest program executable.
+CMAKE_CTEST_COMMAND:INTERNAL=/usr/bin/ctest
+//ADVANCED property for variable: CMAKE_CXX_COMPILER
+CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS
+CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG
+CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL
+CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE
+CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO
+CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_COMPILER
+CMAKE_C_COMPILER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_FLAGS
+CMAKE_C_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_FLAGS_DEBUG
+CMAKE_C_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_FLAGS_MINSIZEREL
+CMAKE_C_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_FLAGS_RELEASE
+CMAKE_C_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_C_FLAGS_RELWITHDEBINFO
+CMAKE_C_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_DLLTOOL
+CMAKE_DLLTOOL-ADVANCED:INTERNAL=1
+//Executable file format
+CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
+CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
+CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
+CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
+CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
+CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
+//Name of external makefile project generator.
+CMAKE_EXTRA_GENERATOR:INTERNAL=
+//Name of generator.
+CMAKE_GENERATOR:INTERNAL=Unix Makefiles
+//Generator instance identifier.
+CMAKE_GENERATOR_INSTANCE:INTERNAL=
+//Name of generator platform.
+CMAKE_GENERATOR_PLATFORM:INTERNAL=
+//Name of generator toolset.
+CMAKE_GENERATOR_TOOLSET:INTERNAL=
+//Test CMAKE_HAVE_LIBC_PTHREAD
+CMAKE_HAVE_LIBC_PTHREAD:INTERNAL=1
+//Have include pthread.h
+CMAKE_HAVE_PTHREAD_H:INTERNAL=1
+//Source directory with the top level CMakeLists.txt file for this
+// project
+CMAKE_HOME_DIRECTORY:INTERNAL=/home/jianyuzh/ws/llama.cpp/develop/examples
+//Install .so files without execute permission.
+CMAKE_INSTALL_SO_NO_EXE:INTERNAL=1
+//ADVANCED property for variable: CMAKE_LINKER
+CMAKE_LINKER-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
+CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
+CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
+CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
+CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
+CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_NM
+CMAKE_NM-ADVANCED:INTERNAL=1
+//number of local generators
+CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=26
+//ADVANCED property for variable: CMAKE_OBJCOPY
+CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_OBJDUMP
+CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
+//Platform information initialized
+CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_RANLIB
+CMAKE_RANLIB-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_READELF
+CMAKE_READELF-ADVANCED:INTERNAL=1
+//Path to CMake installation.
+CMAKE_ROOT:INTERNAL=/usr/share/cmake-3.22
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
+CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
+CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
+CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
+CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
+CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_SKIP_RPATH
+CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
+CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
+CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
+CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
+CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
+CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
+//ADVANCED property for variable: CMAKE_STRIP
+CMAKE_STRIP-ADVANCED:INTERNAL=1
+//uname command
+CMAKE_UNAME:INTERNAL=/usr/bin/uname
+//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
+CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1
+//Details about finding Threads
+FIND_PACKAGE_MESSAGE_DETAILS_Threads:INTERNAL=[TRUE][v()]
+//ADVANCED property for variable: XIAR
+XIAR-ADVANCED:INTERNAL=1
+
diff --git a/examples/sycl/CMakeFiles/3.22.1/CMakeCCompiler.cmake b/examples/sycl/CMakeFiles/3.22.1/CMakeCCompiler.cmake
new file mode 100644
index 0000000000000..0e4cc221817c5
--- /dev/null
+++ b/examples/sycl/CMakeFiles/3.22.1/CMakeCCompiler.cmake
@@ -0,0 +1,72 @@
+set(CMAKE_C_COMPILER "/opt/intel/oneapi/compiler/2024.0/bin/icx")
+set(CMAKE_C_COMPILER_ARG1 "")
+set(CMAKE_C_COMPILER_ID "IntelLLVM")
+set(CMAKE_C_COMPILER_VERSION "2024.0.0")
+set(CMAKE_C_COMPILER_VERSION_INTERNAL "")
+set(CMAKE_C_COMPILER_WRAPPER "")
+set(CMAKE_C_STANDARD_COMPUTED_DEFAULT "17")
+set(CMAKE_C_EXTENSIONS_COMPUTED_DEFAULT "ON")
+set(CMAKE_C_COMPILE_FEATURES "c_std_90;c_function_prototypes;c_std_99;c_restrict;c_variadic_macros;c_std_11;c_static_assert;c_std_17")
+set(CMAKE_C90_COMPILE_FEATURES "c_std_90;c_function_prototypes")
+set(CMAKE_C99_COMPILE_FEATURES "c_std_99;c_restrict;c_variadic_macros")
+set(CMAKE_C11_COMPILE_FEATURES "c_std_11;c_static_assert")
+set(CMAKE_C17_COMPILE_FEATURES "c_std_17")
+set(CMAKE_C23_COMPILE_FEATURES "")
+
+set(CMAKE_C_PLATFORM_ID "Linux")
+set(CMAKE_C_SIMULATE_ID "GNU")
+set(CMAKE_C_COMPILER_FRONTEND_VARIANT "GNU")
+set(CMAKE_C_SIMULATE_VERSION "4.2.1")
+
+
+
+
+set(CMAKE_AR "/usr/bin/ar")
+set(CMAKE_C_COMPILER_AR "")
+set(CMAKE_RANLIB "/usr/bin/ranlib")
+set(CMAKE_C_COMPILER_RANLIB "")
+set(CMAKE_LINKER "/usr/bin/ld")
+set(CMAKE_MT "")
+set(CMAKE_COMPILER_IS_GNUCC )
+set(CMAKE_C_COMPILER_LOADED 1)
+set(CMAKE_C_COMPILER_WORKS TRUE)
+set(CMAKE_C_ABI_COMPILED TRUE)
+
+set(CMAKE_C_COMPILER_ENV_VAR "CC")
+
+set(CMAKE_C_COMPILER_ID_RUN 1)
+set(CMAKE_C_SOURCE_FILE_EXTENSIONS c;m)
+set(CMAKE_C_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC)
+set(CMAKE_C_LINKER_PREFERENCE 10)
+
+# Save compiler ABI information.
+set(CMAKE_C_SIZEOF_DATA_PTR "8")
+set(CMAKE_C_COMPILER_ABI "ELF")
+set(CMAKE_C_BYTE_ORDER "LITTLE_ENDIAN")
+set(CMAKE_C_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
+
+if(CMAKE_C_SIZEOF_DATA_PTR)
+  set(CMAKE_SIZEOF_VOID_P "${CMAKE_C_SIZEOF_DATA_PTR}")
+endif()
+
+if(CMAKE_C_COMPILER_ABI)
+  set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_C_COMPILER_ABI}")
+endif()
+
+if(CMAKE_C_LIBRARY_ARCHITECTURE)
+  set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
+endif()
+
+set(CMAKE_C_CL_SHOWINCLUDES_PREFIX "")
+if(CMAKE_C_CL_SHOWINCLUDES_PREFIX)
+  set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_C_CL_SHOWINCLUDES_PREFIX}")
+endif()
+
+
+
+
+
+set(CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES "/opt/intel/oneapi/tbb/2021.11/include;/opt/intel/oneapi/mpi/2021.11/include;/opt/intel/oneapi/mkl/2024.0/include;/opt/intel/oneapi/ippcp/2021.9/include;/opt/intel/oneapi/ipp/2021.10/include;/opt/intel/oneapi/dpl/2022.3/include;/opt/intel/oneapi/dpcpp-ct/2024.0/include;/opt/intel/oneapi/dnnl/2024.0/include;/opt/intel/oneapi/dev-utilities/2024.0/include;/opt/intel/oneapi/dal/2024.0/include/dal;/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/include;/opt/intel/oneapi/ccl/2021.11/include;/opt/intel/oneapi/compiler/2024.0/opt/compiler/include;/opt/intel/oneapi/compiler/2024.0/lib/clang/17/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include")
+set(CMAKE_C_IMPLICIT_LINK_LIBRARIES "svml;irng;imf;m;gcc;gcc_s;irc;dl;gcc;gcc_s;c;gcc;gcc_s;irc_s")
+set(CMAKE_C_IMPLICIT_LINK_DIRECTORIES "/opt/intel/oneapi/compiler/2024.0/lib;/opt/intel/oneapi/compiler/2024.0/lib/clang/17/lib/x86_64-unknown-linux-gnu;/usr/lib/gcc/x86_64-linux-gnu/11;/usr/lib64;/lib/x86_64-linux-gnu;/lib64;/usr/lib/x86_64-linux-gnu;/usr/lib;/opt/intel/oneapi/compiler/2024.0/opt/compiler/lib;/lib;/opt/intel/oneapi/tbb/2021.11/lib/intel64/gcc4.8;/opt/intel/oneapi/mpi/2021.11/lib;/opt/intel/oneapi/mkl/2024.0/lib;/opt/intel/oneapi/ippcp/2021.9/lib;/opt/intel/oneapi/ipp/2021.10/lib;/opt/intel/oneapi/dpl/2022.3/lib;/opt/intel/oneapi/dnnl/2024.0/lib;/opt/intel/oneapi/dal/2024.0/lib;/opt/intel/oneapi/ccl/2021.11/lib")
+set(CMAKE_C_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
diff --git a/examples/sycl/CMakeFiles/3.22.1/CMakeCXXCompiler.cmake b/examples/sycl/CMakeFiles/3.22.1/CMakeCXXCompiler.cmake
new file mode 100644
index 0000000000000..37999919458bd
--- /dev/null
+++ b/examples/sycl/CMakeFiles/3.22.1/CMakeCXXCompiler.cmake
@@ -0,0 +1,83 @@
+set(CMAKE_CXX_COMPILER "/opt/intel/oneapi/compiler/2024.0/bin/icpx")
+set(CMAKE_CXX_COMPILER_ARG1 "")
+set(CMAKE_CXX_COMPILER_ID "IntelLLVM")
+set(CMAKE_CXX_COMPILER_VERSION "2024.0.0")
+set(CMAKE_CXX_COMPILER_VERSION_INTERNAL "")
+set(CMAKE_CXX_COMPILER_WRAPPER "")
+set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT "17")
+set(CMAKE_CXX_EXTENSIONS_COMPUTED_DEFAULT "ON")
+set(CMAKE_CXX_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters;cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates;cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates;cxx_std_17;cxx_std_20")
+set(CMAKE_CXX98_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters")
+set(CMAKE_CXX11_COMPILE_FEATURES "cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates")
+set(CMAKE_CXX14_COMPILE_FEATURES "cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates")
+set(CMAKE_CXX17_COMPILE_FEATURES "cxx_std_17")
+set(CMAKE_CXX20_COMPILE_FEATURES "cxx_std_20")
+set(CMAKE_CXX23_COMPILE_FEATURES "")
+
+set(CMAKE_CXX_PLATFORM_ID "Linux")
+set(CMAKE_CXX_SIMULATE_ID "GNU")
+set(CMAKE_CXX_COMPILER_FRONTEND_VARIANT "GNU")
+set(CMAKE_CXX_SIMULATE_VERSION "4.2.1")
+
+
+
+
+set(CMAKE_AR "/usr/bin/ar")
+set(CMAKE_CXX_COMPILER_AR "")
+set(CMAKE_RANLIB "/usr/bin/ranlib")
+set(CMAKE_CXX_COMPILER_RANLIB "")
+set(CMAKE_LINKER "/usr/bin/ld")
+set(CMAKE_MT "")
+set(CMAKE_COMPILER_IS_GNUCXX )
+set(CMAKE_CXX_COMPILER_LOADED 1)
+set(CMAKE_CXX_COMPILER_WORKS TRUE)
+set(CMAKE_CXX_ABI_COMPILED TRUE)
+
+set(CMAKE_CXX_COMPILER_ENV_VAR "CXX")
+
+set(CMAKE_CXX_COMPILER_ID_RUN 1)
+set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;m;mm;mpp;CPP;ixx;cppm)
+set(CMAKE_CXX_IGNORE_EXTENSIONS inl;h;hpp;HPP;H;o;O;obj;OBJ;def;DEF;rc;RC)
+
+foreach (lang C OBJC OBJCXX)
+  if (CMAKE_${lang}_COMPILER_ID_RUN)
+    foreach(extension IN LISTS CMAKE_${lang}_SOURCE_FILE_EXTENSIONS)
+      list(REMOVE_ITEM CMAKE_CXX_SOURCE_FILE_EXTENSIONS ${extension})
+    endforeach()
+  endif()
+endforeach()
+
+set(CMAKE_CXX_LINKER_PREFERENCE 30)
+set(CMAKE_CXX_LINKER_PREFERENCE_PROPAGATES 1)
+
+# Save compiler ABI information.
+set(CMAKE_CXX_SIZEOF_DATA_PTR "8")
+set(CMAKE_CXX_COMPILER_ABI "ELF")
+set(CMAKE_CXX_BYTE_ORDER "LITTLE_ENDIAN")
+set(CMAKE_CXX_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
+
+if(CMAKE_CXX_SIZEOF_DATA_PTR)
+  set(CMAKE_SIZEOF_VOID_P "${CMAKE_CXX_SIZEOF_DATA_PTR}")
+endif()
+
+if(CMAKE_CXX_COMPILER_ABI)
+  set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CXX_COMPILER_ABI}")
+endif()
+
+if(CMAKE_CXX_LIBRARY_ARCHITECTURE)
+  set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
+endif()
+
+set(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX "")
+if(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX)
+  set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_CXX_CL_SHOWINCLUDES_PREFIX}")
+endif()
+
+
+
+
+
+set(CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES "/opt/intel/oneapi/tbb/2021.11/include;/opt/intel/oneapi/mpi/2021.11/include;/opt/intel/oneapi/mkl/2024.0/include;/opt/intel/oneapi/ippcp/2021.9/include;/opt/intel/oneapi/ipp/2021.10/include;/opt/intel/oneapi/dpl/2022.3/include;/opt/intel/oneapi/dpcpp-ct/2024.0/include;/opt/intel/oneapi/dnnl/2024.0/include;/opt/intel/oneapi/dev-utilities/2024.0/include;/opt/intel/oneapi/dal/2024.0/include/dal;/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/include;/opt/intel/oneapi/ccl/2021.11/include;/opt/intel/oneapi/compiler/2024.0/opt/compiler/include;/usr/include/c++/11;/usr/include/x86_64-linux-gnu/c++/11;/usr/include/c++/11/backward;/opt/intel/oneapi/compiler/2024.0/lib/clang/17/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include")
+set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "svml;irng;stdc++;imf;m;gcc_s;gcc;irc;dl;gcc_s;gcc;c;gcc_s;gcc;irc_s")
+set(CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "/opt/intel/oneapi/compiler/2024.0/lib;/opt/intel/oneapi/compiler/2024.0/lib/clang/17/lib/x86_64-unknown-linux-gnu;/usr/lib/gcc/x86_64-linux-gnu/11;/usr/lib64;/lib/x86_64-linux-gnu;/lib64;/usr/lib/x86_64-linux-gnu;/usr/lib;/opt/intel/oneapi/compiler/2024.0/opt/compiler/lib;/lib;/opt/intel/oneapi/tbb/2021.11/lib/intel64/gcc4.8;/opt/intel/oneapi/mpi/2021.11/lib;/opt/intel/oneapi/mkl/2024.0/lib;/opt/intel/oneapi/ippcp/2021.9/lib;/opt/intel/oneapi/ipp/2021.10/lib;/opt/intel/oneapi/dpl/2022.3/lib;/opt/intel/oneapi/dnnl/2024.0/lib;/opt/intel/oneapi/dal/2024.0/lib;/opt/intel/oneapi/ccl/2021.11/lib")
+set(CMAKE_CXX_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
diff --git a/examples/sycl/CMakeFiles/3.22.1/CMakeSystem.cmake b/examples/sycl/CMakeFiles/3.22.1/CMakeSystem.cmake
new file mode 100644
index 0000000000000..21d50c58e038f
--- /dev/null
+++ b/examples/sycl/CMakeFiles/3.22.1/CMakeSystem.cmake
@@ -0,0 +1,15 @@
+set(CMAKE_HOST_SYSTEM "Linux-6.2.0-39-generic")
+set(CMAKE_HOST_SYSTEM_NAME "Linux")
+set(CMAKE_HOST_SYSTEM_VERSION "6.2.0-39-generic")
+set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
+
+
+
+set(CMAKE_SYSTEM "Linux-6.2.0-39-generic")
+set(CMAKE_SYSTEM_NAME "Linux")
+set(CMAKE_SYSTEM_VERSION "6.2.0-39-generic")
+set(CMAKE_SYSTEM_PROCESSOR "x86_64")
+
+set(CMAKE_CROSSCOMPILING "FALSE")
+
+set(CMAKE_SYSTEM_LOADED 1)
diff --git a/examples/sycl/CMakeFiles/3.22.1/CompilerIdC/CMakeCCompilerId.c b/examples/sycl/CMakeFiles/3.22.1/CompilerIdC/CMakeCCompilerId.c
new file mode 100644
index 0000000000000..41b99d7783c1e
--- /dev/null
+++ b/examples/sycl/CMakeFiles/3.22.1/CompilerIdC/CMakeCCompilerId.c
@@ -0,0 +1,803 @@
+#ifdef __cplusplus
+# error "A C++ compiler has been selected for C."
+#endif
+
+#if defined(__18CXX)
+# define ID_VOID_MAIN
+#endif
+#if defined(__CLASSIC_C__)
+/* cv-qualifiers did not exist in K&R C */
+# define const
+# define volatile
+#endif
+
+#if !defined(__has_include)
+/* If the compiler does not have __has_include, pretend the answer is
+   always no.  */
+#  define __has_include(x) 0
+#endif
+
+
+/* Version number components: V=Version, R=Revision, P=Patch
+   Version date components:   YYYY=Year, MM=Month,   DD=Day  */
+
+#if defined(__INTEL_COMPILER) || defined(__ICC)
+# define COMPILER_ID "Intel"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# if defined(__GNUC__)
+#  define SIMULATE_ID "GNU"
+# endif
+  /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
+     except that a few beta releases use the old format with V=2021.  */
+# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
+#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
+#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
+#  if defined(__INTEL_COMPILER_UPDATE)
+#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
+#  else
+#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER   % 10)
+#  endif
+# else
+#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
+#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
+   /* The third version component from --version is an update index,
+      but no macro is provided for it.  */
+#  define COMPILER_VERSION_PATCH DEC(0)
+# endif
+# if defined(__INTEL_COMPILER_BUILD_DATE)
+   /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
+#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
+# endif
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+# if defined(__GNUC__)
+#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+# elif defined(__GNUG__)
+#  define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
+# endif
+# if defined(__GNUC_MINOR__)
+#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+# if defined(__GNUC_PATCHLEVEL__)
+#  define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+
+#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
+# define COMPILER_ID "IntelLLVM"
+#if defined(_MSC_VER)
+# define SIMULATE_ID "MSVC"
+#endif
+#if defined(__GNUC__)
+# define SIMULATE_ID "GNU"
+#endif
+/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
+ * later.  Look for 6 digit vs. 8 digit version number to decide encoding.
+ * VVVV is no smaller than the current year when a version is released.
+ */
+#if __INTEL_LLVM_COMPILER < 1000000L
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER    % 10)
+#else
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
+# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER     % 100)
+#endif
+#if defined(_MSC_VER)
+  /* _MSC_VER = VVRR */
+# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+#endif
+#if defined(__GNUC__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+#elif defined(__GNUG__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
+#endif
+#if defined(__GNUC_MINOR__)
+# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+#endif
+#if defined(__GNUC_PATCHLEVEL__)
+# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+#endif
+
+#elif defined(__PATHCC__)
+# define COMPILER_ID "PathScale"
+# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
+# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
+# if defined(__PATHCC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
+# endif
+
+#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
+# define COMPILER_ID "Embarcadero"
+# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
+# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
+# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__     & 0xFFFF)
+
+#elif defined(__BORLANDC__)
+# define COMPILER_ID "Borland"
+  /* __BORLANDC__ = 0xVRR */
+# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
+# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
+
+#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
+# define COMPILER_ID "Watcom"
+   /* __WATCOMC__ = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__WATCOMC__)
+# define COMPILER_ID "OpenWatcom"
+   /* __WATCOMC__ = VVRP + 1100 */
+# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__SUNPRO_C)
+# define COMPILER_ID "SunPro"
+# if __SUNPRO_C >= 0x5100
+   /* __SUNPRO_C = 0xVRRP */
+#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>12)
+#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xFF)
+#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_C    & 0xF)
+# else
+   /* __SUNPRO_CC = 0xVRP */
+#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>8)
+#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xF)
+#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_C    & 0xF)
+# endif
+
+#elif defined(__HP_cc)
+# define COMPILER_ID "HP"
+  /* __HP_cc = VVRRPP */
+# define COMPILER_VERSION_MAJOR DEC(__HP_cc/10000)
+# define COMPILER_VERSION_MINOR DEC(__HP_cc/100 % 100)
+# define COMPILER_VERSION_PATCH DEC(__HP_cc     % 100)
+
+#elif defined(__DECC)
+# define COMPILER_ID "Compaq"
+  /* __DECC_VER = VVRRTPPPP */
+# define COMPILER_VERSION_MAJOR DEC(__DECC_VER/10000000)
+# define COMPILER_VERSION_MINOR DEC(__DECC_VER/100000  % 100)
+# define COMPILER_VERSION_PATCH DEC(__DECC_VER         % 10000)
+
+#elif defined(__IBMC__) && defined(__COMPILER_VER__)
+# define COMPILER_ID "zOS"
+  /* __IBMC__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMC__    % 10)
+
+#elif defined(__ibmxl__) && defined(__clang__)
+# define COMPILER_ID "XLClang"
+# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
+# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
+# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
+# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
+
+
+#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ >= 800
+# define COMPILER_ID "XL"
+  /* __IBMC__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMC__    % 10)
+
+#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ < 800
+# define COMPILER_ID "VisualAge"
+  /* __IBMC__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMC__    % 10)
+
+#elif defined(__NVCOMPILER)
+# define COMPILER_ID "NVHPC"
+# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
+# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
+# if defined(__NVCOMPILER_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
+# endif
+
+#elif defined(__PGI)
+# define COMPILER_ID "PGI"
+# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
+# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
+# if defined(__PGIC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
+# endif
+
+#elif defined(_CRAYC)
+# define COMPILER_ID "Cray"
+# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
+# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
+
+#elif defined(__TI_COMPILER_VERSION__)
+# define COMPILER_ID "TI"
+  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
+# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
+# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000   % 1000)
+# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__        % 1000)
+
+#elif defined(__CLANG_FUJITSU)
+# define COMPILER_ID "FujitsuClang"
+# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
+# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
+# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
+# define COMPILER_VERSION_INTERNAL_STR __clang_version__
+
+
+#elif defined(__FUJITSU)
+# define COMPILER_ID "Fujitsu"
+# if defined(__FCC_version__)
+#   define COMPILER_VERSION __FCC_version__
+# elif defined(__FCC_major__)
+#   define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
+#   define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
+#   define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
+# endif
+# if defined(__fcc_version)
+#   define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
+# elif defined(__FCC_VERSION)
+#   define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
+# endif
+
+
+#elif defined(__ghs__)
+# define COMPILER_ID "GHS"
+/* __GHS_VERSION_NUMBER = VVVVRP */
+# ifdef __GHS_VERSION_NUMBER
+# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
+# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER      % 10)
+# endif
+
+#elif defined(__TINYC__)
+# define COMPILER_ID "TinyCC"
+
+#elif defined(__BCC__)
+# define COMPILER_ID "Bruce"
+
+#elif defined(__SCO_VERSION__)
+# define COMPILER_ID "SCO"
+
+#elif defined(__ARMCC_VERSION) && !defined(__clang__)
+# define COMPILER_ID "ARMCC"
+#if __ARMCC_VERSION >= 1000000
+  /* __ARMCC_VERSION = VRRPPPP */
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION     % 10000)
+#else
+  /* __ARMCC_VERSION = VRPPPP */
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION    % 10000)
+#endif
+
+
+#elif defined(__clang__) && defined(__apple_build_version__)
+# define COMPILER_ID "AppleClang"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
+
+#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
+# define COMPILER_ID "ARMClang"
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION     % 10000)
+# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
+
+#elif defined(__clang__)
+# define COMPILER_ID "Clang"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+
+#elif defined(__GNUC__)
+# define COMPILER_ID "GNU"
+# define COMPILER_VERSION_MAJOR DEC(__GNUC__)
+# if defined(__GNUC_MINOR__)
+#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+# if defined(__GNUC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+
+#elif defined(_MSC_VER)
+# define COMPILER_ID "MSVC"
+  /* _MSC_VER = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
+# if defined(_MSC_FULL_VER)
+#  if _MSC_VER >= 1400
+    /* _MSC_FULL_VER = VVRRPPPPP */
+#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
+#  else
+    /* _MSC_FULL_VER = VVRRPPPP */
+#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
+#  endif
+# endif
+# if defined(_MSC_BUILD)
+#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
+# endif
+
+#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__)
+# define COMPILER_ID "ADSP"
+#if defined(__VISUALDSPVERSION__)
+  /* __VISUALDSPVERSION__ = 0xVVRRPP00 */
+# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24)
+# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF)
+# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8  & 0xFF)
+#endif
+
+#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
+# define COMPILER_ID "IAR"
+# if defined(__VER__) && defined(__ICCARM__)
+#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
+#  define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
+#  define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
+#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
+# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
+#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
+#  define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
+#  define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
+#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
+# endif
+
+#elif defined(__SDCC_VERSION_MAJOR) || defined(SDCC)
+# define COMPILER_ID "SDCC"
+# if defined(__SDCC_VERSION_MAJOR)
+#  define COMPILER_VERSION_MAJOR DEC(__SDCC_VERSION_MAJOR)
+#  define COMPILER_VERSION_MINOR DEC(__SDCC_VERSION_MINOR)
+#  define COMPILER_VERSION_PATCH DEC(__SDCC_VERSION_PATCH)
+# else
+  /* SDCC = VRP */
+#  define COMPILER_VERSION_MAJOR DEC(SDCC/100)
+#  define COMPILER_VERSION_MINOR DEC(SDCC/10 % 10)
+#  define COMPILER_VERSION_PATCH DEC(SDCC    % 10)
+# endif
+
+
+/* These compilers are either not known or too old to define an
+  identification macro.  Try to identify the platform and guess that
+  it is the native compiler.  */
+#elif defined(__hpux) || defined(__hpua)
+# define COMPILER_ID "HP"
+
+#else /* unknown compiler */
+# define COMPILER_ID ""
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+   getting matched.  Store it in a pointer rather than an array
+   because some compilers will just produce instructions to fill the
+   array rather than assigning a pointer to a static array.  */
+char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
+#ifdef SIMULATE_ID
+char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
+#endif
+
+#ifdef __QNXNTO__
+char const* qnxnto = "INFO" ":" "qnxnto[]";
+#endif
+
+#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
+#endif
+
+#define STRINGIFY_HELPER(X) #X
+#define STRINGIFY(X) STRINGIFY_HELPER(X)
+
+/* Identify known platforms by name.  */
+#if defined(__linux) || defined(__linux__) || defined(linux)
+# define PLATFORM_ID "Linux"
+
+#elif defined(__MSYS__)
+# define PLATFORM_ID "MSYS"
+
+#elif defined(__CYGWIN__)
+# define PLATFORM_ID "Cygwin"
+
+#elif defined(__MINGW32__)
+# define PLATFORM_ID "MinGW"
+
+#elif defined(__APPLE__)
+# define PLATFORM_ID "Darwin"
+
+#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
+# define PLATFORM_ID "Windows"
+
+#elif defined(__FreeBSD__) || defined(__FreeBSD)
+# define PLATFORM_ID "FreeBSD"
+
+#elif defined(__NetBSD__) || defined(__NetBSD)
+# define PLATFORM_ID "NetBSD"
+
+#elif defined(__OpenBSD__) || defined(__OPENBSD)
+# define PLATFORM_ID "OpenBSD"
+
+#elif defined(__sun) || defined(sun)
+# define PLATFORM_ID "SunOS"
+
+#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
+# define PLATFORM_ID "AIX"
+
+#elif defined(__hpux) || defined(__hpux__)
+# define PLATFORM_ID "HP-UX"
+
+#elif defined(__HAIKU__)
+# define PLATFORM_ID "Haiku"
+
+#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
+# define PLATFORM_ID "BeOS"
+
+#elif defined(__QNX__) || defined(__QNXNTO__)
+# define PLATFORM_ID "QNX"
+
+#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
+# define PLATFORM_ID "Tru64"
+
+#elif defined(__riscos) || defined(__riscos__)
+# define PLATFORM_ID "RISCos"
+
+#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
+# define PLATFORM_ID "SINIX"
+
+#elif defined(__UNIX_SV__)
+# define PLATFORM_ID "UNIX_SV"
+
+#elif defined(__bsdos__)
+# define PLATFORM_ID "BSDOS"
+
+#elif defined(_MPRAS) || defined(MPRAS)
+# define PLATFORM_ID "MP-RAS"
+
+#elif defined(__osf) || defined(__osf__)
+# define PLATFORM_ID "OSF1"
+
+#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
+# define PLATFORM_ID "SCO_SV"
+
+#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
+# define PLATFORM_ID "ULTRIX"
+
+#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
+# define PLATFORM_ID "Xenix"
+
+#elif defined(__WATCOMC__)
+# if defined(__LINUX__)
+#  define PLATFORM_ID "Linux"
+
+# elif defined(__DOS__)
+#  define PLATFORM_ID "DOS"
+
+# elif defined(__OS2__)
+#  define PLATFORM_ID "OS2"
+
+# elif defined(__WINDOWS__)
+#  define PLATFORM_ID "Windows3x"
+
+# elif defined(__VXWORKS__)
+#  define PLATFORM_ID "VxWorks"
+
+# else /* unknown platform */
+#  define PLATFORM_ID
+# endif
+
+#elif defined(__INTEGRITY)
+# if defined(INT_178B)
+#  define PLATFORM_ID "Integrity178"
+
+# else /* regular Integrity */
+#  define PLATFORM_ID "Integrity"
+# endif
+
+#else /* unknown platform */
+# define PLATFORM_ID
+
+#endif
+
+/* For windows compilers MSVC and Intel we can determine
+   the architecture of the compiler being used.  This is because
+   the compilers do not have flags that can change the architecture,
+   but rather depend on which compiler is being used
+*/
+#if defined(_WIN32) && defined(_MSC_VER)
+# if defined(_M_IA64)
+#  define ARCHITECTURE_ID "IA64"
+
+# elif defined(_M_ARM64EC)
+#  define ARCHITECTURE_ID "ARM64EC"
+
+# elif defined(_M_X64) || defined(_M_AMD64)
+#  define ARCHITECTURE_ID "x64"
+
+# elif defined(_M_IX86)
+#  define ARCHITECTURE_ID "X86"
+
+# elif defined(_M_ARM64)
+#  define ARCHITECTURE_ID "ARM64"
+
+# elif defined(_M_ARM)
+#  if _M_ARM == 4
+#   define ARCHITECTURE_ID "ARMV4I"
+#  elif _M_ARM == 5
+#   define ARCHITECTURE_ID "ARMV5I"
+#  else
+#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
+#  endif
+
+# elif defined(_M_MIPS)
+#  define ARCHITECTURE_ID "MIPS"
+
+# elif defined(_M_SH)
+#  define ARCHITECTURE_ID "SHx"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__WATCOMC__)
+# if defined(_M_I86)
+#  define ARCHITECTURE_ID "I86"
+
+# elif defined(_M_IX86)
+#  define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
+# if defined(__ICCARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__ICCRX__)
+#  define ARCHITECTURE_ID "RX"
+
+# elif defined(__ICCRH850__)
+#  define ARCHITECTURE_ID "RH850"
+
+# elif defined(__ICCRL78__)
+#  define ARCHITECTURE_ID "RL78"
+
+# elif defined(__ICCRISCV__)
+#  define ARCHITECTURE_ID "RISCV"
+
+# elif defined(__ICCAVR__)
+#  define ARCHITECTURE_ID "AVR"
+
+# elif defined(__ICC430__)
+#  define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__ICCV850__)
+#  define ARCHITECTURE_ID "V850"
+
+# elif defined(__ICC8051__)
+#  define ARCHITECTURE_ID "8051"
+
+# elif defined(__ICCSTM8__)
+#  define ARCHITECTURE_ID "STM8"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__ghs__)
+# if defined(__PPC64__)
+#  define ARCHITECTURE_ID "PPC64"
+
+# elif defined(__ppc__)
+#  define ARCHITECTURE_ID "PPC"
+
+# elif defined(__ARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__x86_64__)
+#  define ARCHITECTURE_ID "x64"
+
+# elif defined(__i386__)
+#  define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__TI_COMPILER_VERSION__)
+# if defined(__TI_ARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__MSP430__)
+#  define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__TMS320C28XX__)
+#  define ARCHITECTURE_ID "TMS320C28x"
+
+# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
+#  define ARCHITECTURE_ID "TMS320C6x"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#else
+#  define ARCHITECTURE_ID
+#endif
+
+/* Convert integer to decimal digit literals.  */
+#define DEC(n)                   \
+  ('0' + (((n) / 10000000)%10)), \
+  ('0' + (((n) / 1000000)%10)),  \
+  ('0' + (((n) / 100000)%10)),   \
+  ('0' + (((n) / 10000)%10)),    \
+  ('0' + (((n) / 1000)%10)),     \
+  ('0' + (((n) / 100)%10)),      \
+  ('0' + (((n) / 10)%10)),       \
+  ('0' +  ((n) % 10))
+
+/* Convert integer to hex digit literals.  */
+#define HEX(n)             \
+  ('0' + ((n)>>28 & 0xF)), \
+  ('0' + ((n)>>24 & 0xF)), \
+  ('0' + ((n)>>20 & 0xF)), \
+  ('0' + ((n)>>16 & 0xF)), \
+  ('0' + ((n)>>12 & 0xF)), \
+  ('0' + ((n)>>8  & 0xF)), \
+  ('0' + ((n)>>4  & 0xF)), \
+  ('0' + ((n)     & 0xF))
+
+/* Construct a string literal encoding the version number. */
+#ifdef COMPILER_VERSION
+char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
+
+/* Construct a string literal encoding the version number components. */
+#elif defined(COMPILER_VERSION_MAJOR)
+char const info_version[] = {
+  'I', 'N', 'F', 'O', ':',
+  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
+  COMPILER_VERSION_MAJOR,
+# ifdef COMPILER_VERSION_MINOR
+  '.', COMPILER_VERSION_MINOR,
+#  ifdef COMPILER_VERSION_PATCH
+   '.', COMPILER_VERSION_PATCH,
+#   ifdef COMPILER_VERSION_TWEAK
+    '.', COMPILER_VERSION_TWEAK,
+#   endif
+#  endif
+# endif
+  ']','\0'};
+#endif
+
+/* Construct a string literal encoding the internal version number. */
+#ifdef COMPILER_VERSION_INTERNAL
+char const info_version_internal[] = {
+  'I', 'N', 'F', 'O', ':',
+  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
+  'i','n','t','e','r','n','a','l','[',
+  COMPILER_VERSION_INTERNAL,']','\0'};
+#elif defined(COMPILER_VERSION_INTERNAL_STR)
+char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
+#endif
+
+/* Construct a string literal encoding the version number components. */
+#ifdef SIMULATE_VERSION_MAJOR
+char const info_simulate_version[] = {
+  'I', 'N', 'F', 'O', ':',
+  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
+  SIMULATE_VERSION_MAJOR,
+# ifdef SIMULATE_VERSION_MINOR
+  '.', SIMULATE_VERSION_MINOR,
+#  ifdef SIMULATE_VERSION_PATCH
+   '.', SIMULATE_VERSION_PATCH,
+#   ifdef SIMULATE_VERSION_TWEAK
+    '.', SIMULATE_VERSION_TWEAK,
+#   endif
+#  endif
+# endif
+  ']','\0'};
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+   getting matched.  Store it in a pointer rather than an array
+   because some compilers will just produce instructions to fill the
+   array rather than assigning a pointer to a static array.  */
+char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
+char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
+
+
+
+#if !defined(__STDC__) && !defined(__clang__)
+# if defined(_MSC_VER) || defined(__ibmxl__) || defined(__IBMC__)
+#  define C_VERSION "90"
+# else
+#  define C_VERSION
+# endif
+#elif __STDC_VERSION__ > 201710L
+# define C_VERSION "23"
+#elif __STDC_VERSION__ >= 201710L
+# define C_VERSION "17"
+#elif __STDC_VERSION__ >= 201000L
+# define C_VERSION "11"
+#elif __STDC_VERSION__ >= 199901L
+# define C_VERSION "99"
+#else
+# define C_VERSION "90"
+#endif
+const char* info_language_standard_default =
+  "INFO" ":" "standard_default[" C_VERSION "]";
+
+const char* info_language_extensions_default = "INFO" ":" "extensions_default["
+/* !defined(_MSC_VER) to exclude Clang's MSVC compatibility mode. */
+#if (defined(__clang__) || defined(__GNUC__) ||                               \
+     defined(__TI_COMPILER_VERSION__)) &&                                     \
+  !defined(__STRICT_ANSI__) && !defined(_MSC_VER)
+  "ON"
+#else
+  "OFF"
+#endif
+"]";
+
+/*--------------------------------------------------------------------------*/
+
+#ifdef ID_VOID_MAIN
+void main() {}
+#else
+# if defined(__CLASSIC_C__)
+int main(argc, argv) int argc; char *argv[];
+# else
+int main(int argc, char* argv[])
+# endif
+{
+  int require = 0;
+  require += info_compiler[argc];
+  require += info_platform[argc];
+  require += info_arch[argc];
+#ifdef COMPILER_VERSION_MAJOR
+  require += info_version[argc];
+#endif
+#ifdef COMPILER_VERSION_INTERNAL
+  require += info_version_internal[argc];
+#endif
+#ifdef SIMULATE_ID
+  require += info_simulate[argc];
+#endif
+#ifdef SIMULATE_VERSION_MAJOR
+  require += info_simulate_version[argc];
+#endif
+#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+  require += info_cray[argc];
+#endif
+  require += info_language_standard_default[argc];
+  require += info_language_extensions_default[argc];
+  (void)argv;
+  return require;
+}
+#endif
diff --git a/examples/sycl/CMakeFiles/3.22.1/CompilerIdC/a.out b/examples/sycl/CMakeFiles/3.22.1/CompilerIdC/a.out
new file mode 100755
index 0000000000000000000000000000000000000000..30f4c1c1a2b997c05c3def9a97c52e99170eb848
GIT binary patch
literal 16256
zcmeHOeQXrR6`wns;3R~z9jIMCm(+<&f%Tm+;1WL8_=CO59|R0?Wyp4Yx3*8*hr74e
zoN3h%kt)zA5lN*=l_HftB30@iQBl(>id0HWMN(C2%|8vTA{8Pc5vfV2nzksST;H2{
z?^rK(`k_eqXUE$8&3p5HZ)V@jt!H;<p6u=Ki$)@XQ&j8{C>iyJRH-w_ol+!Sr+7dZ
zVzpQ$?g33L9EL1GR5`}WQB}ZLM|L%U$ZN)oL}L!Fl9GTLRN@vQBJWNzD66o%NQsQ1
zUdhOlMI;NlKE|_D0i&NM>(i7Up&2Se_!{vT{XC3Kv#!jWcPSpDL2)q&nTI0sT8Y<6
zJjS0=eT*DWY$G0>6wl5Y10<9cYA8U`<XMOyZ-jUwWY5U<j2!y|?|JF}r@no}t6irW
zQoGKO8ODuDEf~4J$AE`%`EL_<^M0ytp?<g?Q63pd6m8ke*w&7=Y<g2RlP{Gwl{>d?
z+S;KP3wpaGV7ssm98-G-4~pn<aUONd=W>*>9216^yBiU?Sgx+?Dag?n1-#_^E+-WW
zoX}(AJZ0!ln@7KT0X@cfKKnk<KLp2J$x03nSZT*~CNf3Oafb)GvxU4fY>#CfVObNo
zLf$HRw(D7zkfsY5rpmT8p2^$U%rWFav{S^(*_pfmIfudCin)}^?d?zQ>9*SSc72QB
z{N@g5U=*T=e;D5gNZ$bD{x7Y_+>4R_7U^Dn;Rg`zsCWisOqX{X60r|3cAT#G^xS7y
zcHO6^pNyp3@acKXV!=(Ho^DOjK0JjH2qO?iAdEm5fiMDL1i}b}5%@eJ@VD5SPs|g4
zsWYEl^v5<K%<rD}qLo?m#INfv$mUeGybJ2e!`I*(TWdgW26E%4xvj`ge2_eK>2dSa
zyXJ{&H}(zpCe9~*W4`%jyaCLY7s34FaXq&7X-P4k+j$i<=5vewgz%M{-g2n=5ayxk
zdn;FBYmcMh1wybtEA6-Z1Y!Mc^VALVjSqL5Z``agBNxp}x4mWH;9+u5SGh7CTibJo
zkL^3YGY7?@wBew6VrLYy=Bf9*CFZj`*MM++eWg;lo`%!KMZF+LjzHbj`jP+s6HrL)
ziM?{T<<?XEEw_y05Rg}fTW&$^e};#3Gxht}xmt+h@~a@l&MmtyG6O~LNTIeacJAzH
z%@C1`kw1X;7gg;Gv#3SO*QIVpUF7Y^KSBH%nM7`0sP<8{SKt!D6h<J7Kp25A0$~Kg
z2!s&`BM?R)j6fKHFalu&{)Y(Q_p;<*-_Z6{AvcxDI__vP?>X82{zC&tR6#M5D`joZ
z8HLZ7q?!VGyx`_W`|;Z-DQ!13Im#T*&ZliR4Ic`}?NZhoO>CnQr|dcTVy2KUR?CJ4
zkKAeSoVHlsbX>F@jYDdWw|D56!BoxosM3k#bCc)rfmP+VGe3d+$jX{WTLJOA-Kvi&
zmBWA=0r8E+4nTOFAU58pR9*sH1_!3dvHc=au8*u-vbgRvd^kk?DmZR@SgAaM20e@A
zC&(7a;y23IKra<b>-&~Ae>Hah6LrVM?iG)Id2@^W%!+)h_YBzaw|NldcT5lPB(wuD
zOko7V2!s&`BM?R)j6fKHFalu&!U%*B_^czK!N*%nS7@CyuZ^6LT6q2;#I=@~mMDM+
z7DMJ9Ao*qT^F@+*{qsta`B`TL$=WVM=I~hx6PN$}cBO!FxyC?HYq<gAwFZi`h8CWm
zsQf;u72-w0`w5!~mlHBn*K!($R4wL$MM~y&lt~YZN@Zeyw=1fBv(z5=Vl3Osae~K3
zDbG+j(LZ<Kb>#~+M5B1r!e?qss|n+Ty@Zbu9wpq{-Mw9FJvdg%dnGNA(BaX0Q=%lr
z#CO`^`MYC1sfB#u(7J!U2J48s_9eBRecc;2w85cu(>n=GJ^Sl%t(9{+WHu+_iEZm;
zvlcvI{A<NuD?tP7KU^^!T<8Aa0`_Y_AB9~HQ#8KWpGNvK0sUsu^Y4f4laPssW-($I
zQpMu~*K<e2gM!DuEQHgy;lO(N_sIH_kco&!!M`Wg{}?j!UGGBI^}6)aAYNT(u!}0r
z=jSbHzf1(j%_YgI7rc(2om`XlUl4+>m-U}OrjZ37HbFo%pUl}2sc#a^0Y95TUoV=f
z@nrk$pl^&W6@33;y#e|L5fixV0#my@3vr121pTK#uaS)(T=Wxdg6$xNUoD<QJ^1H+
zBDGZdywo>|PO|6w$xEc)7tsF_^z|@Kz8>&@81#*}`-aAweAfhZ%#feffS+0N!{dz`
z`wscRMLL-H-O^R`1J@O_|A73P0sl*)^T+2A&^JK8!~^ynpnpi#IQKLJ`o<{SwFAGS
zgP_NF;^G-h++Sl--vq--={cU_K+5$JdI9zn*kc*XvnK>7ie71aTu+HR`xY!OXQgoG
zLJ=&}1#2Q(7_+lh+AFw4%Py4#z7Neho|D$&^IPEdg^Xppu03rzdC#2|<F1`^taK@t
zn+A$cvcS@tW7<7nA9cF<{+3Kb+bpYZf7d{-)jQaOdnbCnHrO?g>;^TwceQ#=Dm8od
z3u|xx(4MY-YpAd9K<}_M+_k5_7Z|}!6ozk;#Ha4saMHGC3wb>vlOHcw!S|=&8%@*(
z4G`w;^03myf;9;*d~pXzatN%_nY>jhI%(yN-xez=Yv(6Q_Jm{I{U%sI9ErnkqoJ~~
zVv#DAJ3}lBwwzFqWMiuiVcW@^m(HKE|76ZpLLL6yDAlmxJ`>q<s^Ro5S_$~FTL`^4
zo%8H5K+jb;$+--?MR%u!o-cTgK9Mi!Q*L3(alL6$b?r%R!oexaChg*+(9_d-@Tbso
zRS8!r%}B7I%yqIh5`>OJOXyQsPoF42#)Gg59fu|u6=cWi&Lj=S$uv+|QdBi66@&BO
z*Un{9(1L=8o}o!NKy>H{p+nE)V7SX4<7aTfc_7?c;D4M-%{7M1@V-Ev2U4-L<vik-
z;J~>W^E-*p``|Pv^QY>4f|OeU{d}Ge@P0VWPli0-&sp6AnSa3Cj_WT|{bia^wvZw3
z{R!4T2<138W&RoBpCLXkw7dswB5*LD=beW@ho;Qu`EZ;X`XZG16Z-I{$3eikD)V`M
zIzxP(*K_;Xj`1k)aqi19&wpo$AE(NK?dLWegAAH7U(kG;C-^+y=K6!z|1@y%n$W*Y
z{QpCKY!QN=Gq{+V#iF<dIX_<`zD9h4sy)k$-wW{L#E+8!x0fBT%=rBPpZ~8L#7_r0
zjO%58KMe4BzVmW}vf$@1whvzakBDE5zh)>?O+oZO)Q|H`G+{o^|9F2V?l1JGVEsRb
zLO(yg&XDHub&7RTLG{dWW5ur^vlI^cxm4%Fyx%iu&obk0feW*>Dxc@y8Z5uX#CA%$
zBeVW(D8RAD_4B-#_wVvPT^^sz<9>V|xOn}{=lzJ6+dv7aRYCR4e8wwK;^)JJNZG~}
z@O?r_cenq2DDd-n{@YA^gNcM(FRMR-4C=W3JZ~H!e$XE4@~4kMKpmM^i<po17k^^D
zA9?Kg?cU=0X~5xqpb-vk561<c1G(*f{%nVmvH|B@FWchJY(D;uZOY<bY)Ba7{~NAG
BSUms$

literal 0
HcmV?d00001

diff --git a/examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/CMakeCXXCompilerId.cpp b/examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/CMakeCXXCompilerId.cpp
new file mode 100644
index 0000000000000..25c62a8c3cb40
--- /dev/null
+++ b/examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/CMakeCXXCompilerId.cpp
@@ -0,0 +1,791 @@
+/* This source file must have a .cpp extension so that all C++ compilers
+   recognize the extension without flags.  Borland does not know .cxx for
+   example.  */
+#ifndef __cplusplus
+# error "A C compiler has been selected for C++."
+#endif
+
+#if !defined(__has_include)
+/* If the compiler does not have __has_include, pretend the answer is
+   always no.  */
+#  define __has_include(x) 0
+#endif
+
+
+/* Version number components: V=Version, R=Revision, P=Patch
+   Version date components:   YYYY=Year, MM=Month,   DD=Day  */
+
+#if defined(__COMO__)
+# define COMPILER_ID "Comeau"
+  /* __COMO_VERSION__ = VRR */
+# define COMPILER_VERSION_MAJOR DEC(__COMO_VERSION__ / 100)
+# define COMPILER_VERSION_MINOR DEC(__COMO_VERSION__ % 100)
+
+#elif defined(__INTEL_COMPILER) || defined(__ICC)
+# define COMPILER_ID "Intel"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# if defined(__GNUC__)
+#  define SIMULATE_ID "GNU"
+# endif
+  /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
+     except that a few beta releases use the old format with V=2021.  */
+# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
+#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
+#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
+#  if defined(__INTEL_COMPILER_UPDATE)
+#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
+#  else
+#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER   % 10)
+#  endif
+# else
+#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
+#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
+   /* The third version component from --version is an update index,
+      but no macro is provided for it.  */
+#  define COMPILER_VERSION_PATCH DEC(0)
+# endif
+# if defined(__INTEL_COMPILER_BUILD_DATE)
+   /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
+#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
+# endif
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+# if defined(__GNUC__)
+#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+# elif defined(__GNUG__)
+#  define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
+# endif
+# if defined(__GNUC_MINOR__)
+#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+# if defined(__GNUC_PATCHLEVEL__)
+#  define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+
+#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
+# define COMPILER_ID "IntelLLVM"
+#if defined(_MSC_VER)
+# define SIMULATE_ID "MSVC"
+#endif
+#if defined(__GNUC__)
+# define SIMULATE_ID "GNU"
+#endif
+/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
+ * later.  Look for 6 digit vs. 8 digit version number to decide encoding.
+ * VVVV is no smaller than the current year when a version is released.
+ */
+#if __INTEL_LLVM_COMPILER < 1000000L
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER    % 10)
+#else
+# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
+# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
+# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER     % 100)
+#endif
+#if defined(_MSC_VER)
+  /* _MSC_VER = VVRR */
+# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+#endif
+#if defined(__GNUC__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
+#elif defined(__GNUG__)
+# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
+#endif
+#if defined(__GNUC_MINOR__)
+# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
+#endif
+#if defined(__GNUC_PATCHLEVEL__)
+# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+#endif
+
+#elif defined(__PATHCC__)
+# define COMPILER_ID "PathScale"
+# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
+# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
+# if defined(__PATHCC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
+# endif
+
+#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
+# define COMPILER_ID "Embarcadero"
+# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
+# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
+# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__     & 0xFFFF)
+
+#elif defined(__BORLANDC__)
+# define COMPILER_ID "Borland"
+  /* __BORLANDC__ = 0xVRR */
+# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
+# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
+
+#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
+# define COMPILER_ID "Watcom"
+   /* __WATCOMC__ = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__WATCOMC__)
+# define COMPILER_ID "OpenWatcom"
+   /* __WATCOMC__ = VVRP + 1100 */
+# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
+# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
+# if (__WATCOMC__ % 10) > 0
+#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
+# endif
+
+#elif defined(__SUNPRO_CC)
+# define COMPILER_ID "SunPro"
+# if __SUNPRO_CC >= 0x5100
+   /* __SUNPRO_CC = 0xVRRP */
+#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12)
+#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF)
+#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
+# else
+   /* __SUNPRO_CC = 0xVRP */
+#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8)
+#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF)
+#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
+# endif
+
+#elif defined(__HP_aCC)
+# define COMPILER_ID "HP"
+  /* __HP_aCC = VVRRPP */
+# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000)
+# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100)
+# define COMPILER_VERSION_PATCH DEC(__HP_aCC     % 100)
+
+#elif defined(__DECCXX)
+# define COMPILER_ID "Compaq"
+  /* __DECCXX_VER = VVRRTPPPP */
+# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000)
+# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000  % 100)
+# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER         % 10000)
+
+#elif defined(__IBMCPP__) && defined(__COMPILER_VER__)
+# define COMPILER_ID "zOS"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__ibmxl__) && defined(__clang__)
+# define COMPILER_ID "XLClang"
+# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
+# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
+# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
+# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
+
+
+#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800
+# define COMPILER_ID "XL"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800
+# define COMPILER_ID "VisualAge"
+  /* __IBMCPP__ = VRP */
+# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
+# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
+
+#elif defined(__NVCOMPILER)
+# define COMPILER_ID "NVHPC"
+# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
+# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
+# if defined(__NVCOMPILER_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
+# endif
+
+#elif defined(__PGI)
+# define COMPILER_ID "PGI"
+# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
+# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
+# if defined(__PGIC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
+# endif
+
+#elif defined(_CRAYC)
+# define COMPILER_ID "Cray"
+# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
+# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
+
+#elif defined(__TI_COMPILER_VERSION__)
+# define COMPILER_ID "TI"
+  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
+# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
+# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000   % 1000)
+# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__        % 1000)
+
+#elif defined(__CLANG_FUJITSU)
+# define COMPILER_ID "FujitsuClang"
+# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
+# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
+# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
+# define COMPILER_VERSION_INTERNAL_STR __clang_version__
+
+
+#elif defined(__FUJITSU)
+# define COMPILER_ID "Fujitsu"
+# if defined(__FCC_version__)
+#   define COMPILER_VERSION __FCC_version__
+# elif defined(__FCC_major__)
+#   define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
+#   define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
+#   define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
+# endif
+# if defined(__fcc_version)
+#   define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
+# elif defined(__FCC_VERSION)
+#   define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
+# endif
+
+
+#elif defined(__ghs__)
+# define COMPILER_ID "GHS"
+/* __GHS_VERSION_NUMBER = VVVVRP */
+# ifdef __GHS_VERSION_NUMBER
+# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
+# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
+# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER      % 10)
+# endif
+
+#elif defined(__SCO_VERSION__)
+# define COMPILER_ID "SCO"
+
+#elif defined(__ARMCC_VERSION) && !defined(__clang__)
+# define COMPILER_ID "ARMCC"
+#if __ARMCC_VERSION >= 1000000
+  /* __ARMCC_VERSION = VRRPPPP */
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION     % 10000)
+#else
+  /* __ARMCC_VERSION = VRPPPP */
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION    % 10000)
+#endif
+
+
+#elif defined(__clang__) && defined(__apple_build_version__)
+# define COMPILER_ID "AppleClang"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
+
+#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
+# define COMPILER_ID "ARMClang"
+  # define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
+  # define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
+  # define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION     % 10000)
+# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
+
+#elif defined(__clang__)
+# define COMPILER_ID "Clang"
+# if defined(_MSC_VER)
+#  define SIMULATE_ID "MSVC"
+# endif
+# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
+# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
+# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
+# if defined(_MSC_VER)
+   /* _MSC_VER = VVRR */
+#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
+#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
+# endif
+
+#elif defined(__GNUC__) || defined(__GNUG__)
+# define COMPILER_ID "GNU"
+# if defined(__GNUC__)
+#  define COMPILER_VERSION_MAJOR DEC(__GNUC__)
+# else
+#  define COMPILER_VERSION_MAJOR DEC(__GNUG__)
+# endif
+# if defined(__GNUC_MINOR__)
+#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
+# endif
+# if defined(__GNUC_PATCHLEVEL__)
+#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
+# endif
+
+#elif defined(_MSC_VER)
+# define COMPILER_ID "MSVC"
+  /* _MSC_VER = VVRR */
+# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
+# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
+# if defined(_MSC_FULL_VER)
+#  if _MSC_VER >= 1400
+    /* _MSC_FULL_VER = VVRRPPPPP */
+#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
+#  else
+    /* _MSC_FULL_VER = VVRRPPPP */
+#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
+#  endif
+# endif
+# if defined(_MSC_BUILD)
+#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
+# endif
+
+#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__)
+# define COMPILER_ID "ADSP"
+#if defined(__VISUALDSPVERSION__)
+  /* __VISUALDSPVERSION__ = 0xVVRRPP00 */
+# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24)
+# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF)
+# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8  & 0xFF)
+#endif
+
+#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
+# define COMPILER_ID "IAR"
+# if defined(__VER__) && defined(__ICCARM__)
+#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
+#  define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
+#  define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
+#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
+# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
+#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
+#  define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
+#  define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
+#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
+# endif
+
+
+/* These compilers are either not known or too old to define an
+  identification macro.  Try to identify the platform and guess that
+  it is the native compiler.  */
+#elif defined(__hpux) || defined(__hpua)
+# define COMPILER_ID "HP"
+
+#else /* unknown compiler */
+# define COMPILER_ID ""
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+   getting matched.  Store it in a pointer rather than an array
+   because some compilers will just produce instructions to fill the
+   array rather than assigning a pointer to a static array.  */
+char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
+#ifdef SIMULATE_ID
+char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
+#endif
+
+#ifdef __QNXNTO__
+char const* qnxnto = "INFO" ":" "qnxnto[]";
+#endif
+
+#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
+#endif
+
+#define STRINGIFY_HELPER(X) #X
+#define STRINGIFY(X) STRINGIFY_HELPER(X)
+
+/* Identify known platforms by name.  */
+#if defined(__linux) || defined(__linux__) || defined(linux)
+# define PLATFORM_ID "Linux"
+
+#elif defined(__MSYS__)
+# define PLATFORM_ID "MSYS"
+
+#elif defined(__CYGWIN__)
+# define PLATFORM_ID "Cygwin"
+
+#elif defined(__MINGW32__)
+# define PLATFORM_ID "MinGW"
+
+#elif defined(__APPLE__)
+# define PLATFORM_ID "Darwin"
+
+#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
+# define PLATFORM_ID "Windows"
+
+#elif defined(__FreeBSD__) || defined(__FreeBSD)
+# define PLATFORM_ID "FreeBSD"
+
+#elif defined(__NetBSD__) || defined(__NetBSD)
+# define PLATFORM_ID "NetBSD"
+
+#elif defined(__OpenBSD__) || defined(__OPENBSD)
+# define PLATFORM_ID "OpenBSD"
+
+#elif defined(__sun) || defined(sun)
+# define PLATFORM_ID "SunOS"
+
+#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
+# define PLATFORM_ID "AIX"
+
+#elif defined(__hpux) || defined(__hpux__)
+# define PLATFORM_ID "HP-UX"
+
+#elif defined(__HAIKU__)
+# define PLATFORM_ID "Haiku"
+
+#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
+# define PLATFORM_ID "BeOS"
+
+#elif defined(__QNX__) || defined(__QNXNTO__)
+# define PLATFORM_ID "QNX"
+
+#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
+# define PLATFORM_ID "Tru64"
+
+#elif defined(__riscos) || defined(__riscos__)
+# define PLATFORM_ID "RISCos"
+
+#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
+# define PLATFORM_ID "SINIX"
+
+#elif defined(__UNIX_SV__)
+# define PLATFORM_ID "UNIX_SV"
+
+#elif defined(__bsdos__)
+# define PLATFORM_ID "BSDOS"
+
+#elif defined(_MPRAS) || defined(MPRAS)
+# define PLATFORM_ID "MP-RAS"
+
+#elif defined(__osf) || defined(__osf__)
+# define PLATFORM_ID "OSF1"
+
+#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
+# define PLATFORM_ID "SCO_SV"
+
+#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
+# define PLATFORM_ID "ULTRIX"
+
+#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
+# define PLATFORM_ID "Xenix"
+
+#elif defined(__WATCOMC__)
+# if defined(__LINUX__)
+#  define PLATFORM_ID "Linux"
+
+# elif defined(__DOS__)
+#  define PLATFORM_ID "DOS"
+
+# elif defined(__OS2__)
+#  define PLATFORM_ID "OS2"
+
+# elif defined(__WINDOWS__)
+#  define PLATFORM_ID "Windows3x"
+
+# elif defined(__VXWORKS__)
+#  define PLATFORM_ID "VxWorks"
+
+# else /* unknown platform */
+#  define PLATFORM_ID
+# endif
+
+#elif defined(__INTEGRITY)
+# if defined(INT_178B)
+#  define PLATFORM_ID "Integrity178"
+
+# else /* regular Integrity */
+#  define PLATFORM_ID "Integrity"
+# endif
+
+#else /* unknown platform */
+# define PLATFORM_ID
+
+#endif
+
+/* For windows compilers MSVC and Intel we can determine
+   the architecture of the compiler being used.  This is because
+   the compilers do not have flags that can change the architecture,
+   but rather depend on which compiler is being used
+*/
+#if defined(_WIN32) && defined(_MSC_VER)
+# if defined(_M_IA64)
+#  define ARCHITECTURE_ID "IA64"
+
+# elif defined(_M_ARM64EC)
+#  define ARCHITECTURE_ID "ARM64EC"
+
+# elif defined(_M_X64) || defined(_M_AMD64)
+#  define ARCHITECTURE_ID "x64"
+
+# elif defined(_M_IX86)
+#  define ARCHITECTURE_ID "X86"
+
+# elif defined(_M_ARM64)
+#  define ARCHITECTURE_ID "ARM64"
+
+# elif defined(_M_ARM)
+#  if _M_ARM == 4
+#   define ARCHITECTURE_ID "ARMV4I"
+#  elif _M_ARM == 5
+#   define ARCHITECTURE_ID "ARMV5I"
+#  else
+#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
+#  endif
+
+# elif defined(_M_MIPS)
+#  define ARCHITECTURE_ID "MIPS"
+
+# elif defined(_M_SH)
+#  define ARCHITECTURE_ID "SHx"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__WATCOMC__)
+# if defined(_M_I86)
+#  define ARCHITECTURE_ID "I86"
+
+# elif defined(_M_IX86)
+#  define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
+# if defined(__ICCARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__ICCRX__)
+#  define ARCHITECTURE_ID "RX"
+
+# elif defined(__ICCRH850__)
+#  define ARCHITECTURE_ID "RH850"
+
+# elif defined(__ICCRL78__)
+#  define ARCHITECTURE_ID "RL78"
+
+# elif defined(__ICCRISCV__)
+#  define ARCHITECTURE_ID "RISCV"
+
+# elif defined(__ICCAVR__)
+#  define ARCHITECTURE_ID "AVR"
+
+# elif defined(__ICC430__)
+#  define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__ICCV850__)
+#  define ARCHITECTURE_ID "V850"
+
+# elif defined(__ICC8051__)
+#  define ARCHITECTURE_ID "8051"
+
+# elif defined(__ICCSTM8__)
+#  define ARCHITECTURE_ID "STM8"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__ghs__)
+# if defined(__PPC64__)
+#  define ARCHITECTURE_ID "PPC64"
+
+# elif defined(__ppc__)
+#  define ARCHITECTURE_ID "PPC"
+
+# elif defined(__ARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__x86_64__)
+#  define ARCHITECTURE_ID "x64"
+
+# elif defined(__i386__)
+#  define ARCHITECTURE_ID "X86"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#elif defined(__TI_COMPILER_VERSION__)
+# if defined(__TI_ARM__)
+#  define ARCHITECTURE_ID "ARM"
+
+# elif defined(__MSP430__)
+#  define ARCHITECTURE_ID "MSP430"
+
+# elif defined(__TMS320C28XX__)
+#  define ARCHITECTURE_ID "TMS320C28x"
+
+# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
+#  define ARCHITECTURE_ID "TMS320C6x"
+
+# else /* unknown architecture */
+#  define ARCHITECTURE_ID ""
+# endif
+
+#else
+#  define ARCHITECTURE_ID
+#endif
+
+/* Convert integer to decimal digit literals.  */
+#define DEC(n)                   \
+  ('0' + (((n) / 10000000)%10)), \
+  ('0' + (((n) / 1000000)%10)),  \
+  ('0' + (((n) / 100000)%10)),   \
+  ('0' + (((n) / 10000)%10)),    \
+  ('0' + (((n) / 1000)%10)),     \
+  ('0' + (((n) / 100)%10)),      \
+  ('0' + (((n) / 10)%10)),       \
+  ('0' +  ((n) % 10))
+
+/* Convert integer to hex digit literals.  */
+#define HEX(n)             \
+  ('0' + ((n)>>28 & 0xF)), \
+  ('0' + ((n)>>24 & 0xF)), \
+  ('0' + ((n)>>20 & 0xF)), \
+  ('0' + ((n)>>16 & 0xF)), \
+  ('0' + ((n)>>12 & 0xF)), \
+  ('0' + ((n)>>8  & 0xF)), \
+  ('0' + ((n)>>4  & 0xF)), \
+  ('0' + ((n)     & 0xF))
+
+/* Construct a string literal encoding the version number. */
+#ifdef COMPILER_VERSION
+char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
+
+/* Construct a string literal encoding the version number components. */
+#elif defined(COMPILER_VERSION_MAJOR)
+char const info_version[] = {
+  'I', 'N', 'F', 'O', ':',
+  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
+  COMPILER_VERSION_MAJOR,
+# ifdef COMPILER_VERSION_MINOR
+  '.', COMPILER_VERSION_MINOR,
+#  ifdef COMPILER_VERSION_PATCH
+   '.', COMPILER_VERSION_PATCH,
+#   ifdef COMPILER_VERSION_TWEAK
+    '.', COMPILER_VERSION_TWEAK,
+#   endif
+#  endif
+# endif
+  ']','\0'};
+#endif
+
+/* Construct a string literal encoding the internal version number. */
+#ifdef COMPILER_VERSION_INTERNAL
+char const info_version_internal[] = {
+  'I', 'N', 'F', 'O', ':',
+  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
+  'i','n','t','e','r','n','a','l','[',
+  COMPILER_VERSION_INTERNAL,']','\0'};
+#elif defined(COMPILER_VERSION_INTERNAL_STR)
+char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
+#endif
+
+/* Construct a string literal encoding the version number components. */
+#ifdef SIMULATE_VERSION_MAJOR
+char const info_simulate_version[] = {
+  'I', 'N', 'F', 'O', ':',
+  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
+  SIMULATE_VERSION_MAJOR,
+# ifdef SIMULATE_VERSION_MINOR
+  '.', SIMULATE_VERSION_MINOR,
+#  ifdef SIMULATE_VERSION_PATCH
+   '.', SIMULATE_VERSION_PATCH,
+#   ifdef SIMULATE_VERSION_TWEAK
+    '.', SIMULATE_VERSION_TWEAK,
+#   endif
+#  endif
+# endif
+  ']','\0'};
+#endif
+
+/* Construct the string literal in pieces to prevent the source from
+   getting matched.  Store it in a pointer rather than an array
+   because some compilers will just produce instructions to fill the
+   array rather than assigning a pointer to a static array.  */
+char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
+char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
+
+
+
+#if defined(__INTEL_COMPILER) && defined(_MSVC_LANG) && _MSVC_LANG < 201403L
+#  if defined(__INTEL_CXX11_MODE__)
+#    if defined(__cpp_aggregate_nsdmi)
+#      define CXX_STD 201402L
+#    else
+#      define CXX_STD 201103L
+#    endif
+#  else
+#    define CXX_STD 199711L
+#  endif
+#elif defined(_MSC_VER) && defined(_MSVC_LANG)
+#  define CXX_STD _MSVC_LANG
+#else
+#  define CXX_STD __cplusplus
+#endif
+
+const char* info_language_standard_default = "INFO" ":" "standard_default["
+#if CXX_STD > 202002L
+  "23"
+#elif CXX_STD > 201703L
+  "20"
+#elif CXX_STD >= 201703L
+  "17"
+#elif CXX_STD >= 201402L
+  "14"
+#elif CXX_STD >= 201103L
+  "11"
+#else
+  "98"
+#endif
+"]";
+
+const char* info_language_extensions_default = "INFO" ":" "extensions_default["
+/* !defined(_MSC_VER) to exclude Clang's MSVC compatibility mode. */
+#if (defined(__clang__) || defined(__GNUC__) ||                               \
+     defined(__TI_COMPILER_VERSION__)) &&                                     \
+  !defined(__STRICT_ANSI__) && !defined(_MSC_VER)
+  "ON"
+#else
+  "OFF"
+#endif
+"]";
+
+/*--------------------------------------------------------------------------*/
+
+int main(int argc, char* argv[])
+{
+  int require = 0;
+  require += info_compiler[argc];
+  require += info_platform[argc];
+#ifdef COMPILER_VERSION_MAJOR
+  require += info_version[argc];
+#endif
+#ifdef COMPILER_VERSION_INTERNAL
+  require += info_version_internal[argc];
+#endif
+#ifdef SIMULATE_ID
+  require += info_simulate[argc];
+#endif
+#ifdef SIMULATE_VERSION_MAJOR
+  require += info_simulate_version[argc];
+#endif
+#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
+  require += info_cray[argc];
+#endif
+  require += info_language_standard_default[argc];
+  require += info_language_extensions_default[argc];
+  (void)argv;
+  return require;
+}
diff --git a/examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/a.out b/examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/a.out
new file mode 100755
index 0000000000000000000000000000000000000000..5bbad36aa9e184b4c3bc102bcdf2b1644295c98f
GIT binary patch
literal 16264
zcmeHOYit}>6~1FT#BI~86Jp}#(M*D=Y#dK^H%?+&l8oa=#+B{7oMLd2$#{2cud)wo
zceb@d1&-39gpeu-2?Ys>2!s&)C_*Yye}%MxgogyqABgw?Zj?$XkJgBS3<_D!x%ZsS
zc(UdZA@OIfHFNIy?)mPybLQ^&&YgQd+uhewrD=jwmDnXvGwVzVsk0DPyHp8l7mo{5
ztQYIV2LP#o!;~c`RgUG&DpkO;mh7rQBCi26Qko0oI;jcppc1zbCGzekld?))T%uH#
zqE6|^lSQNpx;~cQBnK@0Jn2tY^$6Wm8IrFPkENf7v6*jG=JPuhkL7WS%Q#8qp^Cg(
zL-A^NDIUui^2d_niEYHAo#NSEt#~YTD8r=7Gt^I#c|*h-A|6Y&XDMR=+(U1B$%pZ`
zpLjJJRRd}^nli(3vw{Un_IChy7?=Mv@ohdx{ua9rwG@Ar8addKPK|7D+mcSUrc>GC
zWb0)6_SWrfMj>a!r3JPNePEy3J8)Q3O^YjtF`vs($8t;<V!qudk&ET(x=ulk`Y6a7
z?$0w4SmcB?uB=doZPF{?=%*5najt~-0KN&12a;`%WpY`&;5m8EwuRk0*l#D@ygQaE
zcy4~MzcZc7x`WP0+ErzhOYFp?V~?h?PC9iG3~h*Zg6hmTsjL9K;3X3+Eg13bsAkH!
zu>_c+7DGIt@V$M#dphm7v9(QbHg0Ug)>lJ}@lQn_<tu=4|9I_-Uxx5ki;IL6*Q6f7
zyYvmDLb|!zloIa|JzAtb>%%d&obLE=zSmeb=fmlzCMokioclW070NIIVFbbmgb@fM
z5Jn)3Kp25A0{`m>{5kUE->uVssI|^5d20*2=D&Q!t18V{r+-*`Rk|u|y9IFRBe&rk
z*<eEM>yR70!0kYO?2o-?ZydAE-m*^Lp4&gz9lH|yiFN&YbOo5#e;Ul+A2lKyUXm2+
z{7&iT{E}Zof%UO@uO3_<!#uctq%<4ZFpY*+NrL_Nu<-o0@1SVDXPuq1-gtMn^~QX)
zrCqab-1AmJ1+7#;ZE1EivZ3ofAKN#*GXuq<*mBr9z4I2nSX*b`_Lf=ac0LLCo#s-h
zbSDX?YfHL8*N%ekvVY{i^8yr7dm^tMX}o)?uko%q4FP#|u<<T<|1H#MC4LdPTmx~e
ze+#t8<yDVrZ$>Vkx}clFI=Agd5^GzoUDJLE=!`_`ua_;Mk~gJNQ>)Erb5}+AzLf6-
zG&@XT1i}b}5eOp?Mj(tp7=bVXVFbbmgb@fM5Juo3BY@w-dIx&;J)Ov9CQ@lPKir%3
z+;m^xbNxqEK_QhXrX9~6hR>J;O@Ka{%V&oB@Y^VXPChX{%pA|jCY^i|z6_2!#k4mZ
z+d(Dnq~~S}sa&>DF55S7^nT+Pw8ehj&7<vb6jD4IZ!<81shsgqWhIi&O<uqUR+%4X
zehm5A+UjSTK;rkeb?=o*M?h`{i8B@-2MN;&V)I<7^i`0n;J~DvJSeouI&JN;rL`B}
z10dqKR$=blQt2r)=vpd2F*ZWB4UX3Vm%#G6p5+bCMjm^ic3SLS^UO!LHf}<5<fGq9
zV8`F)5l4;(#~EnH7y@AmBM?R)j6fKHFalu&!U%*B2qO?iAdJ9&837$W&SILSb<n(q
zk>4>`e_mRtb*h-14(P9tzKkTVr+%FDi{$4C(s}*!TGIJlXASdrnKFm(Qkb~>FZW70
z)F-P=6y>#;(Hf=W`b<ohsQgi-itmzqjAR4JdXg;4Ycfq!LQAPYU!rtw$0XsfNK_`S
z?_No@Z;sjnUyNmYIgar9DD_z?C;I<g@VfHF8lqD?YT%1Cru8JFB)dr-AbEo1-p<aa
z^`^rk#jID<V=)6>y<1~Nsm4ANhxhNcW`c#BaA-Q%ti!_Lj{UuQ*Z$6ymMw7Toirz*
zt9O4RsyA`YfXvodG`6EzHfzxn#=l1VsT4H8`=`OW-{3m;4=!T=B;ZxB>tTZT4N!$T
z;g<sVR>JxBMdqRxGMZ=*L#8Pq?jN|GTNB`&`oAoM(~EGRU;e!^eg-m{SSk4T#Q3)$
zQ|WpyOZ$kx^$3{a@+`y*NKHfpu1LVd_+oMRJ=kbsg?MeFiRKGZCBJ`@7IlKx_cLJ`
zT(}0a9>M-sFYylvLGfYx7QpMo>T=u|f1*kV9q@*LeH`ppz;*F{jT$0!0KSRIK6wQ2
zl@MoKw1SDBgAUeHwHm5XxGaQIR@T2-v=h$HhZhMq1Ni5#{;K7IpNCw}7Z7K~C%;Yg
z><`rQQ^K19`1&f<ev|s)Q&iVY=@*`cgtLp^U_EeM+z%Q59n~KV)IU$S6~I@6U%W5D
z_%}+tK^!N0uD_Y^px<qP>ultc2NuDPFM^9hiM$sxa<Hes8A;imGbR8ic*W6CBO&f@
zRIt5_oxrUM1+Yx!?6Gui#7WypFPAUaPH|G;d}!MB+@uk$Y=PSsQnr)NJ5#os_3~3<
zH1A|wJ6X(RrhwwpY_Rken0EF%C*01VA%4R3CXK|zgs?x=7mKH|qdA_rL?j-QNN`dU
zT=X35=<l|>2fA=0h7I$xcDF^K)pbzVU7sB2=<n?m_TIjIdpi2;eLX#gx(DsSjy-+d
zz;Kg}=fEzFeX#Q)nabM5f}4bT{97|j|7HrgGXtVk$l2pCzY9A&WW#tiSZO#7EM#n;
zs=&66a&53nLsHXDcC6@(x%Pw8!peYWrG*S3)Ce0)=z<pnB~TsgPFXPj3`D$EwZjB1
zJd4*kQYbL+uQst%*zZztPm3@LQyI@00qNzH9OqmL=G5~O!pP=4*BHwdjfs42!p(bA
z1a<7`ZN;uC%f_9;xG<7aS*TA*FRw~yI(&g*Ls{NUJ4g@)b~Is3q&;IS2N@6IDh%wX
zV3d=$+i=IJYmO&@%9^68KB^d;hkBh%DgjP&9@Y#^!fs<g_=ExXCj*_|_k(@NERF}^
z5d;70RBEU;Wrp_$@_3Mnr7h=Cei#lMvoXJ&_`DxZhcbWS=QU&7A>-!{5uf+P>1;@n
z|DQ121)0CX7?1r=5`U5gmW^b{dx(Pm2cR6ssLa1a{7b~YKztnMYH%>0$DhvuhNjHt
zadDIy%Hv=#kqPVYr(>Ytn3efFPMsw_kMFttY{&8h@No>xI*)_rh#w_q!S-_-PC^Dv
zna|^HogDJGoBao`|0Uq!HDUcS@&7ygfC7o%H<YG)aUSJuIQ)E__&V`PlqsyU{Ct2P
zC4Q6)xV>Bf>ny(%;Pd}+llY$q+%Wdb^?fD4=W)ix70QC&$Jm~33#;O5#OL_)xJl^B
zr0U=J$8jf`FrUYJyw4N&8TwPu|M#KL&yQ|2rFnFtVx3W<eCD_@@B_#!hl74D<#93Z
z`wZH%&hlr#h0$V}Uml0Us$5KLr?mSz<G+Fe?0f8=$F;nVm-p*(|70HDM|_XQ>t{ai
zJG{9CKuAqWl+Vm(ISVCzzA#N?8&!gz6H0rq{qI15pU>mD2I8AcB*}gmeh)H;ar=3E
zH$?oPJ^J#e_d!97%*#d0$LEVbG2bt_@A>V1%>Jpv;d21LwQ_qnF8DskZTItM+LU%O
YP&xZ$Tl|?-;y=4XS^R?yNe21<1ipxBpa1{>

literal 0
HcmV?d00001

diff --git a/examples/sycl/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/CMakeFiles/Makefile.cmake b/examples/sycl/CMakeFiles/Makefile.cmake
new file mode 100644
index 0000000000000..b7ba0b4b841a3
--- /dev/null
+++ b/examples/sycl/CMakeFiles/Makefile.cmake
@@ -0,0 +1,130 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# The generator used is:
+set(CMAKE_DEPENDS_GENERATOR "Unix Makefiles")
+
+# The top level Makefile was generated from the following files:
+set(CMAKE_MAKEFILE_DEPENDS
+  "CMakeCache.txt"
+  "../CMakeLists.txt"
+  "../baby-llama/CMakeLists.txt"
+  "../batched-bench/CMakeLists.txt"
+  "../batched/CMakeLists.txt"
+  "../beam-search/CMakeLists.txt"
+  "../benchmark/CMakeLists.txt"
+  "../convert-llama2c-to-ggml/CMakeLists.txt"
+  "../embedding/CMakeLists.txt"
+  "../export-lora/CMakeLists.txt"
+  "../finetune/CMakeLists.txt"
+  "../infill/CMakeLists.txt"
+  "../llama-bench/CMakeLists.txt"
+  "../llava/CMakeLists.txt"
+  "../lookahead/CMakeLists.txt"
+  "../lookup/CMakeLists.txt"
+  "../main/CMakeLists.txt"
+  "../parallel/CMakeLists.txt"
+  "../perplexity/CMakeLists.txt"
+  "../quantize-stats/CMakeLists.txt"
+  "../quantize/CMakeLists.txt"
+  "../save-load-state/CMakeLists.txt"
+  "../simple/CMakeLists.txt"
+  "../speculative/CMakeLists.txt"
+  "CMakeFiles/3.22.1/CMakeCCompiler.cmake"
+  "CMakeFiles/3.22.1/CMakeCXXCompiler.cmake"
+  "CMakeFiles/3.22.1/CMakeSystem.cmake"
+  "CMakeLists.txt"
+  "../tokenize/CMakeLists.txt"
+  "../train-text-from-scratch/CMakeLists.txt"
+  "/usr/share/cmake-3.22/Modules/CMakeCInformation.cmake"
+  "/usr/share/cmake-3.22/Modules/CMakeCXXInformation.cmake"
+  "/usr/share/cmake-3.22/Modules/CMakeCommonLanguageInclude.cmake"
+  "/usr/share/cmake-3.22/Modules/CMakeGenericSystem.cmake"
+  "/usr/share/cmake-3.22/Modules/CMakeInitializeConfigs.cmake"
+  "/usr/share/cmake-3.22/Modules/CMakeLanguageInformation.cmake"
+  "/usr/share/cmake-3.22/Modules/CMakeSystemSpecificInformation.cmake"
+  "/usr/share/cmake-3.22/Modules/CMakeSystemSpecificInitialize.cmake"
+  "/usr/share/cmake-3.22/Modules/CheckCSourceCompiles.cmake"
+  "/usr/share/cmake-3.22/Modules/CheckIncludeFile.cmake"
+  "/usr/share/cmake-3.22/Modules/CheckLibraryExists.cmake"
+  "/usr/share/cmake-3.22/Modules/Compiler/CMakeCommonCompilerMacros.cmake"
+  "/usr/share/cmake-3.22/Modules/Compiler/IntelLLVM-C.cmake"
+  "/usr/share/cmake-3.22/Modules/Compiler/IntelLLVM-CXX.cmake"
+  "/usr/share/cmake-3.22/Modules/Compiler/IntelLLVM.cmake"
+  "/usr/share/cmake-3.22/Modules/FindPackageHandleStandardArgs.cmake"
+  "/usr/share/cmake-3.22/Modules/FindPackageMessage.cmake"
+  "/usr/share/cmake-3.22/Modules/FindThreads.cmake"
+  "/usr/share/cmake-3.22/Modules/Internal/CheckSourceCompiles.cmake"
+  "/usr/share/cmake-3.22/Modules/Platform/Linux-IntelLLVM-C.cmake"
+  "/usr/share/cmake-3.22/Modules/Platform/Linux-IntelLLVM-CXX.cmake"
+  "/usr/share/cmake-3.22/Modules/Platform/Linux-IntelLLVM.cmake"
+  "/usr/share/cmake-3.22/Modules/Platform/Linux.cmake"
+  "/usr/share/cmake-3.22/Modules/Platform/UnixPaths.cmake"
+  )
+
+# The corresponding makefile is:
+set(CMAKE_MAKEFILE_OUTPUTS
+  "Makefile"
+  "CMakeFiles/cmake.check_cache"
+  )
+
+# Byproducts of CMake generate step:
+set(CMAKE_MAKEFILE_PRODUCTS
+  "CMakeFiles/CMakeDirectoryInformation.cmake"
+  "baby-llama/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "batched/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "batched-bench/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "beam-search/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "benchmark/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "convert-llama2c-to-ggml/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "embedding/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "finetune/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "infill/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "llama-bench/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "llava/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "sycl/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "main/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "tokenize/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "parallel/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "perplexity/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "quantize/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "quantize-stats/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "save-load-state/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "simple/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "speculative/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "lookahead/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "lookup/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "train-text-from-scratch/CMakeFiles/CMakeDirectoryInformation.cmake"
+  "export-lora/CMakeFiles/CMakeDirectoryInformation.cmake"
+  )
+
+# Dependency information for all targets:
+set(CMAKE_DEPEND_INFO_FILES
+  "baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake"
+  "batched/CMakeFiles/batched.dir/DependInfo.cmake"
+  "batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake"
+  "beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake"
+  "benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake"
+  "convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake"
+  "embedding/CMakeFiles/embedding.dir/DependInfo.cmake"
+  "finetune/CMakeFiles/finetune.dir/DependInfo.cmake"
+  "infill/CMakeFiles/infill.dir/DependInfo.cmake"
+  "llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake"
+  "llava/CMakeFiles/llava.dir/DependInfo.cmake"
+  "llava/CMakeFiles/llava_static.dir/DependInfo.cmake"
+  "llava/CMakeFiles/llava-cli.dir/DependInfo.cmake"
+  "sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake"
+  "main/CMakeFiles/main.dir/DependInfo.cmake"
+  "tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake"
+  "parallel/CMakeFiles/parallel.dir/DependInfo.cmake"
+  "perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake"
+  "quantize/CMakeFiles/quantize.dir/DependInfo.cmake"
+  "quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake"
+  "save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake"
+  "simple/CMakeFiles/simple.dir/DependInfo.cmake"
+  "speculative/CMakeFiles/speculative.dir/DependInfo.cmake"
+  "lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake"
+  "lookup/CMakeFiles/lookup.dir/DependInfo.cmake"
+  "train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake"
+  "export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake"
+  )
diff --git a/examples/sycl/CMakeFiles/Makefile2 b/examples/sycl/CMakeFiles/Makefile2
new file mode 100644
index 0000000000000..27a69e727b838
--- /dev/null
+++ b/examples/sycl/CMakeFiles/Makefile2
@@ -0,0 +1,1239 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Directory level rules for the build root directory
+
+# The main recursive "all" target.
+all: baby-llama/all
+all: batched/all
+all: batched-bench/all
+all: beam-search/all
+all: benchmark/all
+all: convert-llama2c-to-ggml/all
+all: embedding/all
+all: finetune/all
+all: infill/all
+all: llama-bench/all
+all: llava/all
+all: sycl/all
+all: main/all
+all: tokenize/all
+all: parallel/all
+all: perplexity/all
+all: quantize/all
+all: quantize-stats/all
+all: save-load-state/all
+all: simple/all
+all: speculative/all
+all: lookahead/all
+all: lookup/all
+all: train-text-from-scratch/all
+all: export-lora/all
+.PHONY : all
+
+# The main recursive "preinstall" target.
+preinstall: baby-llama/preinstall
+preinstall: batched/preinstall
+preinstall: batched-bench/preinstall
+preinstall: beam-search/preinstall
+preinstall: benchmark/preinstall
+preinstall: convert-llama2c-to-ggml/preinstall
+preinstall: embedding/preinstall
+preinstall: finetune/preinstall
+preinstall: infill/preinstall
+preinstall: llama-bench/preinstall
+preinstall: llava/preinstall
+preinstall: sycl/preinstall
+preinstall: main/preinstall
+preinstall: tokenize/preinstall
+preinstall: parallel/preinstall
+preinstall: perplexity/preinstall
+preinstall: quantize/preinstall
+preinstall: quantize-stats/preinstall
+preinstall: save-load-state/preinstall
+preinstall: simple/preinstall
+preinstall: speculative/preinstall
+preinstall: lookahead/preinstall
+preinstall: lookup/preinstall
+preinstall: train-text-from-scratch/preinstall
+preinstall: export-lora/preinstall
+.PHONY : preinstall
+
+# The main recursive "clean" target.
+clean: baby-llama/clean
+clean: batched/clean
+clean: batched-bench/clean
+clean: beam-search/clean
+clean: benchmark/clean
+clean: convert-llama2c-to-ggml/clean
+clean: embedding/clean
+clean: finetune/clean
+clean: infill/clean
+clean: llama-bench/clean
+clean: llava/clean
+clean: sycl/clean
+clean: main/clean
+clean: tokenize/clean
+clean: parallel/clean
+clean: perplexity/clean
+clean: quantize/clean
+clean: quantize-stats/clean
+clean: save-load-state/clean
+clean: simple/clean
+clean: speculative/clean
+clean: lookahead/clean
+clean: lookup/clean
+clean: train-text-from-scratch/clean
+clean: export-lora/clean
+.PHONY : clean
+
+#=============================================================================
+# Directory level rules for directory baby-llama
+
+# Recursive "all" directory target.
+baby-llama/all: baby-llama/CMakeFiles/baby-llama.dir/all
+.PHONY : baby-llama/all
+
+# Recursive "preinstall" directory target.
+baby-llama/preinstall:
+.PHONY : baby-llama/preinstall
+
+# Recursive "clean" directory target.
+baby-llama/clean: baby-llama/CMakeFiles/baby-llama.dir/clean
+.PHONY : baby-llama/clean
+
+#=============================================================================
+# Directory level rules for directory batched
+
+# Recursive "all" directory target.
+batched/all: batched/CMakeFiles/batched.dir/all
+.PHONY : batched/all
+
+# Recursive "preinstall" directory target.
+batched/preinstall:
+.PHONY : batched/preinstall
+
+# Recursive "clean" directory target.
+batched/clean: batched/CMakeFiles/batched.dir/clean
+.PHONY : batched/clean
+
+#=============================================================================
+# Directory level rules for directory batched-bench
+
+# Recursive "all" directory target.
+batched-bench/all: batched-bench/CMakeFiles/batched-bench.dir/all
+.PHONY : batched-bench/all
+
+# Recursive "preinstall" directory target.
+batched-bench/preinstall:
+.PHONY : batched-bench/preinstall
+
+# Recursive "clean" directory target.
+batched-bench/clean: batched-bench/CMakeFiles/batched-bench.dir/clean
+.PHONY : batched-bench/clean
+
+#=============================================================================
+# Directory level rules for directory beam-search
+
+# Recursive "all" directory target.
+beam-search/all: beam-search/CMakeFiles/beam-search.dir/all
+.PHONY : beam-search/all
+
+# Recursive "preinstall" directory target.
+beam-search/preinstall:
+.PHONY : beam-search/preinstall
+
+# Recursive "clean" directory target.
+beam-search/clean: beam-search/CMakeFiles/beam-search.dir/clean
+.PHONY : beam-search/clean
+
+#=============================================================================
+# Directory level rules for directory benchmark
+
+# Recursive "all" directory target.
+benchmark/all: benchmark/CMakeFiles/benchmark.dir/all
+.PHONY : benchmark/all
+
+# Recursive "preinstall" directory target.
+benchmark/preinstall:
+.PHONY : benchmark/preinstall
+
+# Recursive "clean" directory target.
+benchmark/clean: benchmark/CMakeFiles/benchmark.dir/clean
+.PHONY : benchmark/clean
+
+#=============================================================================
+# Directory level rules for directory convert-llama2c-to-ggml
+
+# Recursive "all" directory target.
+convert-llama2c-to-ggml/all: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/all
+.PHONY : convert-llama2c-to-ggml/all
+
+# Recursive "preinstall" directory target.
+convert-llama2c-to-ggml/preinstall:
+.PHONY : convert-llama2c-to-ggml/preinstall
+
+# Recursive "clean" directory target.
+convert-llama2c-to-ggml/clean: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean
+.PHONY : convert-llama2c-to-ggml/clean
+
+#=============================================================================
+# Directory level rules for directory embedding
+
+# Recursive "all" directory target.
+embedding/all: embedding/CMakeFiles/embedding.dir/all
+.PHONY : embedding/all
+
+# Recursive "preinstall" directory target.
+embedding/preinstall:
+.PHONY : embedding/preinstall
+
+# Recursive "clean" directory target.
+embedding/clean: embedding/CMakeFiles/embedding.dir/clean
+.PHONY : embedding/clean
+
+#=============================================================================
+# Directory level rules for directory export-lora
+
+# Recursive "all" directory target.
+export-lora/all: export-lora/CMakeFiles/export-lora.dir/all
+.PHONY : export-lora/all
+
+# Recursive "preinstall" directory target.
+export-lora/preinstall:
+.PHONY : export-lora/preinstall
+
+# Recursive "clean" directory target.
+export-lora/clean: export-lora/CMakeFiles/export-lora.dir/clean
+.PHONY : export-lora/clean
+
+#=============================================================================
+# Directory level rules for directory finetune
+
+# Recursive "all" directory target.
+finetune/all: finetune/CMakeFiles/finetune.dir/all
+.PHONY : finetune/all
+
+# Recursive "preinstall" directory target.
+finetune/preinstall:
+.PHONY : finetune/preinstall
+
+# Recursive "clean" directory target.
+finetune/clean: finetune/CMakeFiles/finetune.dir/clean
+.PHONY : finetune/clean
+
+#=============================================================================
+# Directory level rules for directory infill
+
+# Recursive "all" directory target.
+infill/all: infill/CMakeFiles/infill.dir/all
+.PHONY : infill/all
+
+# Recursive "preinstall" directory target.
+infill/preinstall:
+.PHONY : infill/preinstall
+
+# Recursive "clean" directory target.
+infill/clean: infill/CMakeFiles/infill.dir/clean
+.PHONY : infill/clean
+
+#=============================================================================
+# Directory level rules for directory llama-bench
+
+# Recursive "all" directory target.
+llama-bench/all: llama-bench/CMakeFiles/llama-bench.dir/all
+.PHONY : llama-bench/all
+
+# Recursive "preinstall" directory target.
+llama-bench/preinstall:
+.PHONY : llama-bench/preinstall
+
+# Recursive "clean" directory target.
+llama-bench/clean: llama-bench/CMakeFiles/llama-bench.dir/clean
+.PHONY : llama-bench/clean
+
+#=============================================================================
+# Directory level rules for directory llava
+
+# Recursive "all" directory target.
+llava/all: llava/CMakeFiles/llava.dir/all
+llava/all: llava/CMakeFiles/llava_static.dir/all
+llava/all: llava/CMakeFiles/llava-cli.dir/all
+.PHONY : llava/all
+
+# Recursive "preinstall" directory target.
+llava/preinstall:
+.PHONY : llava/preinstall
+
+# Recursive "clean" directory target.
+llava/clean: llava/CMakeFiles/llava.dir/clean
+llava/clean: llava/CMakeFiles/llava_static.dir/clean
+llava/clean: llava/CMakeFiles/llava-cli.dir/clean
+.PHONY : llava/clean
+
+#=============================================================================
+# Directory level rules for directory lookahead
+
+# Recursive "all" directory target.
+lookahead/all: lookahead/CMakeFiles/lookahead.dir/all
+.PHONY : lookahead/all
+
+# Recursive "preinstall" directory target.
+lookahead/preinstall:
+.PHONY : lookahead/preinstall
+
+# Recursive "clean" directory target.
+lookahead/clean: lookahead/CMakeFiles/lookahead.dir/clean
+.PHONY : lookahead/clean
+
+#=============================================================================
+# Directory level rules for directory lookup
+
+# Recursive "all" directory target.
+lookup/all: lookup/CMakeFiles/lookup.dir/all
+.PHONY : lookup/all
+
+# Recursive "preinstall" directory target.
+lookup/preinstall:
+.PHONY : lookup/preinstall
+
+# Recursive "clean" directory target.
+lookup/clean: lookup/CMakeFiles/lookup.dir/clean
+.PHONY : lookup/clean
+
+#=============================================================================
+# Directory level rules for directory main
+
+# Recursive "all" directory target.
+main/all: main/CMakeFiles/main.dir/all
+.PHONY : main/all
+
+# Recursive "preinstall" directory target.
+main/preinstall:
+.PHONY : main/preinstall
+
+# Recursive "clean" directory target.
+main/clean: main/CMakeFiles/main.dir/clean
+.PHONY : main/clean
+
+#=============================================================================
+# Directory level rules for directory parallel
+
+# Recursive "all" directory target.
+parallel/all: parallel/CMakeFiles/parallel.dir/all
+.PHONY : parallel/all
+
+# Recursive "preinstall" directory target.
+parallel/preinstall:
+.PHONY : parallel/preinstall
+
+# Recursive "clean" directory target.
+parallel/clean: parallel/CMakeFiles/parallel.dir/clean
+.PHONY : parallel/clean
+
+#=============================================================================
+# Directory level rules for directory perplexity
+
+# Recursive "all" directory target.
+perplexity/all: perplexity/CMakeFiles/perplexity.dir/all
+.PHONY : perplexity/all
+
+# Recursive "preinstall" directory target.
+perplexity/preinstall:
+.PHONY : perplexity/preinstall
+
+# Recursive "clean" directory target.
+perplexity/clean: perplexity/CMakeFiles/perplexity.dir/clean
+.PHONY : perplexity/clean
+
+#=============================================================================
+# Directory level rules for directory quantize
+
+# Recursive "all" directory target.
+quantize/all: quantize/CMakeFiles/quantize.dir/all
+.PHONY : quantize/all
+
+# Recursive "preinstall" directory target.
+quantize/preinstall:
+.PHONY : quantize/preinstall
+
+# Recursive "clean" directory target.
+quantize/clean: quantize/CMakeFiles/quantize.dir/clean
+.PHONY : quantize/clean
+
+#=============================================================================
+# Directory level rules for directory quantize-stats
+
+# Recursive "all" directory target.
+quantize-stats/all: quantize-stats/CMakeFiles/quantize-stats.dir/all
+.PHONY : quantize-stats/all
+
+# Recursive "preinstall" directory target.
+quantize-stats/preinstall:
+.PHONY : quantize-stats/preinstall
+
+# Recursive "clean" directory target.
+quantize-stats/clean: quantize-stats/CMakeFiles/quantize-stats.dir/clean
+.PHONY : quantize-stats/clean
+
+#=============================================================================
+# Directory level rules for directory save-load-state
+
+# Recursive "all" directory target.
+save-load-state/all: save-load-state/CMakeFiles/save-load-state.dir/all
+.PHONY : save-load-state/all
+
+# Recursive "preinstall" directory target.
+save-load-state/preinstall:
+.PHONY : save-load-state/preinstall
+
+# Recursive "clean" directory target.
+save-load-state/clean: save-load-state/CMakeFiles/save-load-state.dir/clean
+.PHONY : save-load-state/clean
+
+#=============================================================================
+# Directory level rules for directory simple
+
+# Recursive "all" directory target.
+simple/all: simple/CMakeFiles/simple.dir/all
+.PHONY : simple/all
+
+# Recursive "preinstall" directory target.
+simple/preinstall:
+.PHONY : simple/preinstall
+
+# Recursive "clean" directory target.
+simple/clean: simple/CMakeFiles/simple.dir/clean
+.PHONY : simple/clean
+
+#=============================================================================
+# Directory level rules for directory speculative
+
+# Recursive "all" directory target.
+speculative/all: speculative/CMakeFiles/speculative.dir/all
+.PHONY : speculative/all
+
+# Recursive "preinstall" directory target.
+speculative/preinstall:
+.PHONY : speculative/preinstall
+
+# Recursive "clean" directory target.
+speculative/clean: speculative/CMakeFiles/speculative.dir/clean
+.PHONY : speculative/clean
+
+#=============================================================================
+# Directory level rules for directory sycl
+
+# Recursive "all" directory target.
+sycl/all: sycl/CMakeFiles/ls-sycl-device.dir/all
+.PHONY : sycl/all
+
+# Recursive "preinstall" directory target.
+sycl/preinstall:
+.PHONY : sycl/preinstall
+
+# Recursive "clean" directory target.
+sycl/clean: sycl/CMakeFiles/ls-sycl-device.dir/clean
+.PHONY : sycl/clean
+
+#=============================================================================
+# Directory level rules for directory tokenize
+
+# Recursive "all" directory target.
+tokenize/all: tokenize/CMakeFiles/tokenize.dir/all
+.PHONY : tokenize/all
+
+# Recursive "preinstall" directory target.
+tokenize/preinstall:
+.PHONY : tokenize/preinstall
+
+# Recursive "clean" directory target.
+tokenize/clean: tokenize/CMakeFiles/tokenize.dir/clean
+.PHONY : tokenize/clean
+
+#=============================================================================
+# Directory level rules for directory train-text-from-scratch
+
+# Recursive "all" directory target.
+train-text-from-scratch/all: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/all
+.PHONY : train-text-from-scratch/all
+
+# Recursive "preinstall" directory target.
+train-text-from-scratch/preinstall:
+.PHONY : train-text-from-scratch/preinstall
+
+# Recursive "clean" directory target.
+train-text-from-scratch/clean: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean
+.PHONY : train-text-from-scratch/clean
+
+#=============================================================================
+# Target rules for target baby-llama/CMakeFiles/baby-llama.dir
+
+# All Build rule for target.
+baby-llama/CMakeFiles/baby-llama.dir/all:
+	$(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/depend
+	$(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=1,2 "Built target baby-llama"
+.PHONY : baby-llama/CMakeFiles/baby-llama.dir/all
+
+# Build rule for subdir invocation for target.
+baby-llama/CMakeFiles/baby-llama.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/CMakeFiles/baby-llama.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : baby-llama/CMakeFiles/baby-llama.dir/rule
+
+# Convenience name for target.
+baby-llama: baby-llama/CMakeFiles/baby-llama.dir/rule
+.PHONY : baby-llama
+
+# clean rule for target.
+baby-llama/CMakeFiles/baby-llama.dir/clean:
+	$(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/clean
+.PHONY : baby-llama/CMakeFiles/baby-llama.dir/clean
+
+#=============================================================================
+# Target rules for target batched/CMakeFiles/batched.dir
+
+# All Build rule for target.
+batched/CMakeFiles/batched.dir/all:
+	$(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/depend
+	$(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=3,4 "Built target batched"
+.PHONY : batched/CMakeFiles/batched.dir/all
+
+# Build rule for subdir invocation for target.
+batched/CMakeFiles/batched.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/CMakeFiles/batched.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : batched/CMakeFiles/batched.dir/rule
+
+# Convenience name for target.
+batched: batched/CMakeFiles/batched.dir/rule
+.PHONY : batched
+
+# clean rule for target.
+batched/CMakeFiles/batched.dir/clean:
+	$(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/clean
+.PHONY : batched/CMakeFiles/batched.dir/clean
+
+#=============================================================================
+# Target rules for target batched-bench/CMakeFiles/batched-bench.dir
+
+# All Build rule for target.
+batched-bench/CMakeFiles/batched-bench.dir/all:
+	$(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/depend
+	$(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=5,6 "Built target batched-bench"
+.PHONY : batched-bench/CMakeFiles/batched-bench.dir/all
+
+# Build rule for subdir invocation for target.
+batched-bench/CMakeFiles/batched-bench.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/CMakeFiles/batched-bench.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : batched-bench/CMakeFiles/batched-bench.dir/rule
+
+# Convenience name for target.
+batched-bench: batched-bench/CMakeFiles/batched-bench.dir/rule
+.PHONY : batched-bench
+
+# clean rule for target.
+batched-bench/CMakeFiles/batched-bench.dir/clean:
+	$(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/clean
+.PHONY : batched-bench/CMakeFiles/batched-bench.dir/clean
+
+#=============================================================================
+# Target rules for target beam-search/CMakeFiles/beam-search.dir
+
+# All Build rule for target.
+beam-search/CMakeFiles/beam-search.dir/all:
+	$(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/depend
+	$(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=7,8 "Built target beam-search"
+.PHONY : beam-search/CMakeFiles/beam-search.dir/all
+
+# Build rule for subdir invocation for target.
+beam-search/CMakeFiles/beam-search.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/CMakeFiles/beam-search.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : beam-search/CMakeFiles/beam-search.dir/rule
+
+# Convenience name for target.
+beam-search: beam-search/CMakeFiles/beam-search.dir/rule
+.PHONY : beam-search
+
+# clean rule for target.
+beam-search/CMakeFiles/beam-search.dir/clean:
+	$(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/clean
+.PHONY : beam-search/CMakeFiles/beam-search.dir/clean
+
+#=============================================================================
+# Target rules for target benchmark/CMakeFiles/benchmark.dir
+
+# All Build rule for target.
+benchmark/CMakeFiles/benchmark.dir/all:
+	$(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/depend
+	$(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=9,10 "Built target benchmark"
+.PHONY : benchmark/CMakeFiles/benchmark.dir/all
+
+# Build rule for subdir invocation for target.
+benchmark/CMakeFiles/benchmark.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/CMakeFiles/benchmark.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : benchmark/CMakeFiles/benchmark.dir/rule
+
+# Convenience name for target.
+benchmark: benchmark/CMakeFiles/benchmark.dir/rule
+.PHONY : benchmark
+
+# clean rule for target.
+benchmark/CMakeFiles/benchmark.dir/clean:
+	$(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/clean
+.PHONY : benchmark/CMakeFiles/benchmark.dir/clean
+
+#=============================================================================
+# Target rules for target convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir
+
+# All Build rule for target.
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/all:
+	$(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend
+	$(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=11,12 "Built target convert-llama2c-to-ggml"
+.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/all
+
+# Build rule for subdir invocation for target.
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule
+
+# Convenience name for target.
+convert-llama2c-to-ggml: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule
+.PHONY : convert-llama2c-to-ggml
+
+# clean rule for target.
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean:
+	$(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean
+.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean
+
+#=============================================================================
+# Target rules for target embedding/CMakeFiles/embedding.dir
+
+# All Build rule for target.
+embedding/CMakeFiles/embedding.dir/all:
+	$(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/depend
+	$(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=13,14 "Built target embedding"
+.PHONY : embedding/CMakeFiles/embedding.dir/all
+
+# Build rule for subdir invocation for target.
+embedding/CMakeFiles/embedding.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/CMakeFiles/embedding.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : embedding/CMakeFiles/embedding.dir/rule
+
+# Convenience name for target.
+embedding: embedding/CMakeFiles/embedding.dir/rule
+.PHONY : embedding
+
+# clean rule for target.
+embedding/CMakeFiles/embedding.dir/clean:
+	$(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/clean
+.PHONY : embedding/CMakeFiles/embedding.dir/clean
+
+#=============================================================================
+# Target rules for target finetune/CMakeFiles/finetune.dir
+
+# All Build rule for target.
+finetune/CMakeFiles/finetune.dir/all:
+	$(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/depend
+	$(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=17,18 "Built target finetune"
+.PHONY : finetune/CMakeFiles/finetune.dir/all
+
+# Build rule for subdir invocation for target.
+finetune/CMakeFiles/finetune.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/CMakeFiles/finetune.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : finetune/CMakeFiles/finetune.dir/rule
+
+# Convenience name for target.
+finetune: finetune/CMakeFiles/finetune.dir/rule
+.PHONY : finetune
+
+# clean rule for target.
+finetune/CMakeFiles/finetune.dir/clean:
+	$(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/clean
+.PHONY : finetune/CMakeFiles/finetune.dir/clean
+
+#=============================================================================
+# Target rules for target infill/CMakeFiles/infill.dir
+
+# All Build rule for target.
+infill/CMakeFiles/infill.dir/all:
+	$(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/depend
+	$(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=19,20 "Built target infill"
+.PHONY : infill/CMakeFiles/infill.dir/all
+
+# Build rule for subdir invocation for target.
+infill/CMakeFiles/infill.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/CMakeFiles/infill.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : infill/CMakeFiles/infill.dir/rule
+
+# Convenience name for target.
+infill: infill/CMakeFiles/infill.dir/rule
+.PHONY : infill
+
+# clean rule for target.
+infill/CMakeFiles/infill.dir/clean:
+	$(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/clean
+.PHONY : infill/CMakeFiles/infill.dir/clean
+
+#=============================================================================
+# Target rules for target llama-bench/CMakeFiles/llama-bench.dir
+
+# All Build rule for target.
+llama-bench/CMakeFiles/llama-bench.dir/all:
+	$(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/depend
+	$(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=21,22 "Built target llama-bench"
+.PHONY : llama-bench/CMakeFiles/llama-bench.dir/all
+
+# Build rule for subdir invocation for target.
+llama-bench/CMakeFiles/llama-bench.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/CMakeFiles/llama-bench.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : llama-bench/CMakeFiles/llama-bench.dir/rule
+
+# Convenience name for target.
+llama-bench: llama-bench/CMakeFiles/llama-bench.dir/rule
+.PHONY : llama-bench
+
+# clean rule for target.
+llama-bench/CMakeFiles/llama-bench.dir/clean:
+	$(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/clean
+.PHONY : llama-bench/CMakeFiles/llama-bench.dir/clean
+
+#=============================================================================
+# Target rules for target llava/CMakeFiles/llava.dir
+
+# All Build rule for target.
+llava/CMakeFiles/llava.dir/all:
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/depend
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=23,24 "Built target llava"
+.PHONY : llava/CMakeFiles/llava.dir/all
+
+# Build rule for subdir invocation for target.
+llava/CMakeFiles/llava.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : llava/CMakeFiles/llava.dir/rule
+
+# Convenience name for target.
+llava: llava/CMakeFiles/llava.dir/rule
+.PHONY : llava
+
+# clean rule for target.
+llava/CMakeFiles/llava.dir/clean:
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/clean
+.PHONY : llava/CMakeFiles/llava.dir/clean
+
+#=============================================================================
+# Target rules for target llava/CMakeFiles/llava_static.dir
+
+# All Build rule for target.
+llava/CMakeFiles/llava_static.dir/all: llava/CMakeFiles/llava.dir/all
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava_static.dir/build.make llava/CMakeFiles/llava_static.dir/depend
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava_static.dir/build.make llava/CMakeFiles/llava_static.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=27 "Built target llava_static"
+.PHONY : llava/CMakeFiles/llava_static.dir/all
+
+# Build rule for subdir invocation for target.
+llava/CMakeFiles/llava_static.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 3
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava_static.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : llava/CMakeFiles/llava_static.dir/rule
+
+# Convenience name for target.
+llava_static: llava/CMakeFiles/llava_static.dir/rule
+.PHONY : llava_static
+
+# clean rule for target.
+llava/CMakeFiles/llava_static.dir/clean:
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava_static.dir/build.make llava/CMakeFiles/llava_static.dir/clean
+.PHONY : llava/CMakeFiles/llava_static.dir/clean
+
+#=============================================================================
+# Target rules for target llava/CMakeFiles/llava-cli.dir
+
+# All Build rule for target.
+llava/CMakeFiles/llava-cli.dir/all: llava/CMakeFiles/llava.dir/all
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/depend
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=25,26 "Built target llava-cli"
+.PHONY : llava/CMakeFiles/llava-cli.dir/all
+
+# Build rule for subdir invocation for target.
+llava/CMakeFiles/llava-cli.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 4
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava-cli.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : llava/CMakeFiles/llava-cli.dir/rule
+
+# Convenience name for target.
+llava-cli: llava/CMakeFiles/llava-cli.dir/rule
+.PHONY : llava-cli
+
+# clean rule for target.
+llava/CMakeFiles/llava-cli.dir/clean:
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/clean
+.PHONY : llava/CMakeFiles/llava-cli.dir/clean
+
+#=============================================================================
+# Target rules for target sycl/CMakeFiles/ls-sycl-device.dir
+
+# All Build rule for target.
+sycl/CMakeFiles/ls-sycl-device.dir/all:
+	$(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/depend
+	$(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=32,33 "Built target ls-sycl-device"
+.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/all
+
+# Build rule for subdir invocation for target.
+sycl/CMakeFiles/ls-sycl-device.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/CMakeFiles/ls-sycl-device.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/rule
+
+# Convenience name for target.
+ls-sycl-device: sycl/CMakeFiles/ls-sycl-device.dir/rule
+.PHONY : ls-sycl-device
+
+# clean rule for target.
+sycl/CMakeFiles/ls-sycl-device.dir/clean:
+	$(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/clean
+.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/clean
+
+#=============================================================================
+# Target rules for target main/CMakeFiles/main.dir
+
+# All Build rule for target.
+main/CMakeFiles/main.dir/all:
+	$(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/depend
+	$(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=34,35 "Built target main"
+.PHONY : main/CMakeFiles/main.dir/all
+
+# Build rule for subdir invocation for target.
+main/CMakeFiles/main.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/CMakeFiles/main.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : main/CMakeFiles/main.dir/rule
+
+# Convenience name for target.
+main: main/CMakeFiles/main.dir/rule
+.PHONY : main
+
+# clean rule for target.
+main/CMakeFiles/main.dir/clean:
+	$(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/clean
+.PHONY : main/CMakeFiles/main.dir/clean
+
+#=============================================================================
+# Target rules for target tokenize/CMakeFiles/tokenize.dir
+
+# All Build rule for target.
+tokenize/CMakeFiles/tokenize.dir/all:
+	$(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/depend
+	$(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=50,51 "Built target tokenize"
+.PHONY : tokenize/CMakeFiles/tokenize.dir/all
+
+# Build rule for subdir invocation for target.
+tokenize/CMakeFiles/tokenize.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/CMakeFiles/tokenize.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : tokenize/CMakeFiles/tokenize.dir/rule
+
+# Convenience name for target.
+tokenize: tokenize/CMakeFiles/tokenize.dir/rule
+.PHONY : tokenize
+
+# clean rule for target.
+tokenize/CMakeFiles/tokenize.dir/clean:
+	$(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/clean
+.PHONY : tokenize/CMakeFiles/tokenize.dir/clean
+
+#=============================================================================
+# Target rules for target parallel/CMakeFiles/parallel.dir
+
+# All Build rule for target.
+parallel/CMakeFiles/parallel.dir/all:
+	$(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/depend
+	$(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=36,37 "Built target parallel"
+.PHONY : parallel/CMakeFiles/parallel.dir/all
+
+# Build rule for subdir invocation for target.
+parallel/CMakeFiles/parallel.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/CMakeFiles/parallel.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : parallel/CMakeFiles/parallel.dir/rule
+
+# Convenience name for target.
+parallel: parallel/CMakeFiles/parallel.dir/rule
+.PHONY : parallel
+
+# clean rule for target.
+parallel/CMakeFiles/parallel.dir/clean:
+	$(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/clean
+.PHONY : parallel/CMakeFiles/parallel.dir/clean
+
+#=============================================================================
+# Target rules for target perplexity/CMakeFiles/perplexity.dir
+
+# All Build rule for target.
+perplexity/CMakeFiles/perplexity.dir/all:
+	$(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/depend
+	$(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=38,39 "Built target perplexity"
+.PHONY : perplexity/CMakeFiles/perplexity.dir/all
+
+# Build rule for subdir invocation for target.
+perplexity/CMakeFiles/perplexity.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/CMakeFiles/perplexity.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : perplexity/CMakeFiles/perplexity.dir/rule
+
+# Convenience name for target.
+perplexity: perplexity/CMakeFiles/perplexity.dir/rule
+.PHONY : perplexity
+
+# clean rule for target.
+perplexity/CMakeFiles/perplexity.dir/clean:
+	$(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/clean
+.PHONY : perplexity/CMakeFiles/perplexity.dir/clean
+
+#=============================================================================
+# Target rules for target quantize/CMakeFiles/quantize.dir
+
+# All Build rule for target.
+quantize/CMakeFiles/quantize.dir/all:
+	$(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/depend
+	$(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=40,41 "Built target quantize"
+.PHONY : quantize/CMakeFiles/quantize.dir/all
+
+# Build rule for subdir invocation for target.
+quantize/CMakeFiles/quantize.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/CMakeFiles/quantize.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : quantize/CMakeFiles/quantize.dir/rule
+
+# Convenience name for target.
+quantize: quantize/CMakeFiles/quantize.dir/rule
+.PHONY : quantize
+
+# clean rule for target.
+quantize/CMakeFiles/quantize.dir/clean:
+	$(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/clean
+.PHONY : quantize/CMakeFiles/quantize.dir/clean
+
+#=============================================================================
+# Target rules for target quantize-stats/CMakeFiles/quantize-stats.dir
+
+# All Build rule for target.
+quantize-stats/CMakeFiles/quantize-stats.dir/all:
+	$(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/depend
+	$(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=42,43 "Built target quantize-stats"
+.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/all
+
+# Build rule for subdir invocation for target.
+quantize-stats/CMakeFiles/quantize-stats.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/CMakeFiles/quantize-stats.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/rule
+
+# Convenience name for target.
+quantize-stats: quantize-stats/CMakeFiles/quantize-stats.dir/rule
+.PHONY : quantize-stats
+
+# clean rule for target.
+quantize-stats/CMakeFiles/quantize-stats.dir/clean:
+	$(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/clean
+.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/clean
+
+#=============================================================================
+# Target rules for target save-load-state/CMakeFiles/save-load-state.dir
+
+# All Build rule for target.
+save-load-state/CMakeFiles/save-load-state.dir/all:
+	$(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/depend
+	$(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=44,45 "Built target save-load-state"
+.PHONY : save-load-state/CMakeFiles/save-load-state.dir/all
+
+# Build rule for subdir invocation for target.
+save-load-state/CMakeFiles/save-load-state.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/CMakeFiles/save-load-state.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : save-load-state/CMakeFiles/save-load-state.dir/rule
+
+# Convenience name for target.
+save-load-state: save-load-state/CMakeFiles/save-load-state.dir/rule
+.PHONY : save-load-state
+
+# clean rule for target.
+save-load-state/CMakeFiles/save-load-state.dir/clean:
+	$(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/clean
+.PHONY : save-load-state/CMakeFiles/save-load-state.dir/clean
+
+#=============================================================================
+# Target rules for target simple/CMakeFiles/simple.dir
+
+# All Build rule for target.
+simple/CMakeFiles/simple.dir/all:
+	$(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/depend
+	$(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=46,47 "Built target simple"
+.PHONY : simple/CMakeFiles/simple.dir/all
+
+# Build rule for subdir invocation for target.
+simple/CMakeFiles/simple.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/CMakeFiles/simple.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : simple/CMakeFiles/simple.dir/rule
+
+# Convenience name for target.
+simple: simple/CMakeFiles/simple.dir/rule
+.PHONY : simple
+
+# clean rule for target.
+simple/CMakeFiles/simple.dir/clean:
+	$(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/clean
+.PHONY : simple/CMakeFiles/simple.dir/clean
+
+#=============================================================================
+# Target rules for target speculative/CMakeFiles/speculative.dir
+
+# All Build rule for target.
+speculative/CMakeFiles/speculative.dir/all:
+	$(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/depend
+	$(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=48,49 "Built target speculative"
+.PHONY : speculative/CMakeFiles/speculative.dir/all
+
+# Build rule for subdir invocation for target.
+speculative/CMakeFiles/speculative.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/CMakeFiles/speculative.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : speculative/CMakeFiles/speculative.dir/rule
+
+# Convenience name for target.
+speculative: speculative/CMakeFiles/speculative.dir/rule
+.PHONY : speculative
+
+# clean rule for target.
+speculative/CMakeFiles/speculative.dir/clean:
+	$(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/clean
+.PHONY : speculative/CMakeFiles/speculative.dir/clean
+
+#=============================================================================
+# Target rules for target lookahead/CMakeFiles/lookahead.dir
+
+# All Build rule for target.
+lookahead/CMakeFiles/lookahead.dir/all:
+	$(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/depend
+	$(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=28,29 "Built target lookahead"
+.PHONY : lookahead/CMakeFiles/lookahead.dir/all
+
+# Build rule for subdir invocation for target.
+lookahead/CMakeFiles/lookahead.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/CMakeFiles/lookahead.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : lookahead/CMakeFiles/lookahead.dir/rule
+
+# Convenience name for target.
+lookahead: lookahead/CMakeFiles/lookahead.dir/rule
+.PHONY : lookahead
+
+# clean rule for target.
+lookahead/CMakeFiles/lookahead.dir/clean:
+	$(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/clean
+.PHONY : lookahead/CMakeFiles/lookahead.dir/clean
+
+#=============================================================================
+# Target rules for target lookup/CMakeFiles/lookup.dir
+
+# All Build rule for target.
+lookup/CMakeFiles/lookup.dir/all:
+	$(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/depend
+	$(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=30,31 "Built target lookup"
+.PHONY : lookup/CMakeFiles/lookup.dir/all
+
+# Build rule for subdir invocation for target.
+lookup/CMakeFiles/lookup.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/CMakeFiles/lookup.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : lookup/CMakeFiles/lookup.dir/rule
+
+# Convenience name for target.
+lookup: lookup/CMakeFiles/lookup.dir/rule
+.PHONY : lookup
+
+# clean rule for target.
+lookup/CMakeFiles/lookup.dir/clean:
+	$(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/clean
+.PHONY : lookup/CMakeFiles/lookup.dir/clean
+
+#=============================================================================
+# Target rules for target train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir
+
+# All Build rule for target.
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/all:
+	$(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend
+	$(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=52,53 "Built target train-text-from-scratch"
+.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/all
+
+# Build rule for subdir invocation for target.
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule
+
+# Convenience name for target.
+train-text-from-scratch: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule
+.PHONY : train-text-from-scratch
+
+# clean rule for target.
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean:
+	$(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean
+.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean
+
+#=============================================================================
+# Target rules for target export-lora/CMakeFiles/export-lora.dir
+
+# All Build rule for target.
+export-lora/CMakeFiles/export-lora.dir/all:
+	$(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/depend
+	$(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/build
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=15,16 "Built target export-lora"
+.PHONY : export-lora/CMakeFiles/export-lora.dir/all
+
+# Build rule for subdir invocation for target.
+export-lora/CMakeFiles/export-lora.dir/rule: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/CMakeFiles/export-lora.dir/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : export-lora/CMakeFiles/export-lora.dir/rule
+
+# Convenience name for target.
+export-lora: export-lora/CMakeFiles/export-lora.dir/rule
+.PHONY : export-lora
+
+# clean rule for target.
+export-lora/CMakeFiles/export-lora.dir/clean:
+	$(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/clean
+.PHONY : export-lora/CMakeFiles/export-lora.dir/clean
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/CMakeFiles/Progress/1 b/examples/sycl/CMakeFiles/Progress/1
new file mode 100644
index 0000000000000..7b4d68d70fcae
--- /dev/null
+++ b/examples/sycl/CMakeFiles/Progress/1
@@ -0,0 +1 @@
+empty
\ No newline at end of file
diff --git a/examples/sycl/CMakeFiles/Progress/count.txt b/examples/sycl/CMakeFiles/Progress/count.txt
new file mode 100644
index 0000000000000..59343b09ec765
--- /dev/null
+++ b/examples/sycl/CMakeFiles/Progress/count.txt
@@ -0,0 +1 @@
+53
diff --git a/examples/sycl/CMakeFiles/TargetDirectories.txt b/examples/sycl/CMakeFiles/TargetDirectories.txt
new file mode 100644
index 0000000000000..8c32b6fec862f
--- /dev/null
+++ b/examples/sycl/CMakeFiles/TargetDirectories.txt
@@ -0,0 +1,183 @@
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/batched.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/beam-search.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/benchmark.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/embedding.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/finetune.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/infill.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava_static.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava-cli.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/main.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/tokenize.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/parallel.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/perplexity.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/quantize.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/simple.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/speculative.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/lookahead.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/lookup.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/install/strip.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/export-lora.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/edit_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/rebuild_cache.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/list_install_components.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/install.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/install/local.dir
+/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/install/strip.dir
diff --git a/examples/sycl/CMakeFiles/cmake.check_cache b/examples/sycl/CMakeFiles/cmake.check_cache
new file mode 100644
index 0000000000000..3dccd731726d7
--- /dev/null
+++ b/examples/sycl/CMakeFiles/cmake.check_cache
@@ -0,0 +1 @@
+# This file is generated by cmake for dependency checking of the CMakeCache.txt file
diff --git a/examples/sycl/CMakeFiles/progress.marks b/examples/sycl/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..59343b09ec765
--- /dev/null
+++ b/examples/sycl/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+53
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
new file mode 100644
index 0000000000000..aee37f6a1d49b
--- /dev/null
+++ b/examples/sycl/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(TARGET ls-sycl-device)
+add_executable(${TARGET} ls-sycl-device.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
\ No newline at end of file
diff --git a/examples/sycl/Makefile b/examples/sycl/Makefile
new file mode 100644
index 0000000000000..8db0f04b1819b
--- /dev/null
+++ b/examples/sycl/Makefile
@@ -0,0 +1,567 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl//CMakeFiles/progress.marks
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+#=============================================================================
+# Target rules for targets named baby-llama
+
+# Build rule for target.
+baby-llama: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama
+.PHONY : baby-llama
+
+# fast build rule for target.
+baby-llama/fast:
+	$(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/build
+.PHONY : baby-llama/fast
+
+#=============================================================================
+# Target rules for targets named batched
+
+# Build rule for target.
+batched: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched
+.PHONY : batched
+
+# fast build rule for target.
+batched/fast:
+	$(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/build
+.PHONY : batched/fast
+
+#=============================================================================
+# Target rules for targets named batched-bench
+
+# Build rule for target.
+batched-bench: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench
+.PHONY : batched-bench
+
+# fast build rule for target.
+batched-bench/fast:
+	$(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/build
+.PHONY : batched-bench/fast
+
+#=============================================================================
+# Target rules for targets named beam-search
+
+# Build rule for target.
+beam-search: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search
+.PHONY : beam-search
+
+# fast build rule for target.
+beam-search/fast:
+	$(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/build
+.PHONY : beam-search/fast
+
+#=============================================================================
+# Target rules for targets named benchmark
+
+# Build rule for target.
+benchmark: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark
+.PHONY : benchmark
+
+# fast build rule for target.
+benchmark/fast:
+	$(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/build
+.PHONY : benchmark/fast
+
+#=============================================================================
+# Target rules for targets named convert-llama2c-to-ggml
+
+# Build rule for target.
+convert-llama2c-to-ggml: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml
+.PHONY : convert-llama2c-to-ggml
+
+# fast build rule for target.
+convert-llama2c-to-ggml/fast:
+	$(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build
+.PHONY : convert-llama2c-to-ggml/fast
+
+#=============================================================================
+# Target rules for targets named embedding
+
+# Build rule for target.
+embedding: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding
+.PHONY : embedding
+
+# fast build rule for target.
+embedding/fast:
+	$(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/build
+.PHONY : embedding/fast
+
+#=============================================================================
+# Target rules for targets named finetune
+
+# Build rule for target.
+finetune: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune
+.PHONY : finetune
+
+# fast build rule for target.
+finetune/fast:
+	$(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/build
+.PHONY : finetune/fast
+
+#=============================================================================
+# Target rules for targets named infill
+
+# Build rule for target.
+infill: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill
+.PHONY : infill
+
+# fast build rule for target.
+infill/fast:
+	$(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/build
+.PHONY : infill/fast
+
+#=============================================================================
+# Target rules for targets named llama-bench
+
+# Build rule for target.
+llama-bench: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench
+.PHONY : llama-bench
+
+# fast build rule for target.
+llama-bench/fast:
+	$(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/build
+.PHONY : llama-bench/fast
+
+#=============================================================================
+# Target rules for targets named llava
+
+# Build rule for target.
+llava: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava
+.PHONY : llava
+
+# fast build rule for target.
+llava/fast:
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/build
+.PHONY : llava/fast
+
+#=============================================================================
+# Target rules for targets named llava_static
+
+# Build rule for target.
+llava_static: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava_static
+.PHONY : llava_static
+
+# fast build rule for target.
+llava_static/fast:
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava_static.dir/build.make llava/CMakeFiles/llava_static.dir/build
+.PHONY : llava_static/fast
+
+#=============================================================================
+# Target rules for targets named llava-cli
+
+# Build rule for target.
+llava-cli: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava-cli
+.PHONY : llava-cli
+
+# fast build rule for target.
+llava-cli/fast:
+	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/build
+.PHONY : llava-cli/fast
+
+#=============================================================================
+# Target rules for targets named ls-sycl-device
+
+# Build rule for target.
+ls-sycl-device: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ls-sycl-device
+.PHONY : ls-sycl-device
+
+# fast build rule for target.
+ls-sycl-device/fast:
+	$(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/build
+.PHONY : ls-sycl-device/fast
+
+#=============================================================================
+# Target rules for targets named main
+
+# Build rule for target.
+main: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main
+.PHONY : main
+
+# fast build rule for target.
+main/fast:
+	$(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/build
+.PHONY : main/fast
+
+#=============================================================================
+# Target rules for targets named tokenize
+
+# Build rule for target.
+tokenize: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize
+.PHONY : tokenize
+
+# fast build rule for target.
+tokenize/fast:
+	$(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/build
+.PHONY : tokenize/fast
+
+#=============================================================================
+# Target rules for targets named parallel
+
+# Build rule for target.
+parallel: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel
+.PHONY : parallel
+
+# fast build rule for target.
+parallel/fast:
+	$(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/build
+.PHONY : parallel/fast
+
+#=============================================================================
+# Target rules for targets named perplexity
+
+# Build rule for target.
+perplexity: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity
+.PHONY : perplexity
+
+# fast build rule for target.
+perplexity/fast:
+	$(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/build
+.PHONY : perplexity/fast
+
+#=============================================================================
+# Target rules for targets named quantize
+
+# Build rule for target.
+quantize: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize
+.PHONY : quantize
+
+# fast build rule for target.
+quantize/fast:
+	$(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/build
+.PHONY : quantize/fast
+
+#=============================================================================
+# Target rules for targets named quantize-stats
+
+# Build rule for target.
+quantize-stats: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats
+.PHONY : quantize-stats
+
+# fast build rule for target.
+quantize-stats/fast:
+	$(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/build
+.PHONY : quantize-stats/fast
+
+#=============================================================================
+# Target rules for targets named save-load-state
+
+# Build rule for target.
+save-load-state: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state
+.PHONY : save-load-state
+
+# fast build rule for target.
+save-load-state/fast:
+	$(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/build
+.PHONY : save-load-state/fast
+
+#=============================================================================
+# Target rules for targets named simple
+
+# Build rule for target.
+simple: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple
+.PHONY : simple
+
+# fast build rule for target.
+simple/fast:
+	$(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/build
+.PHONY : simple/fast
+
+#=============================================================================
+# Target rules for targets named speculative
+
+# Build rule for target.
+speculative: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative
+.PHONY : speculative
+
+# fast build rule for target.
+speculative/fast:
+	$(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/build
+.PHONY : speculative/fast
+
+#=============================================================================
+# Target rules for targets named lookahead
+
+# Build rule for target.
+lookahead: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead
+.PHONY : lookahead
+
+# fast build rule for target.
+lookahead/fast:
+	$(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/build
+.PHONY : lookahead/fast
+
+#=============================================================================
+# Target rules for targets named lookup
+
+# Build rule for target.
+lookup: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup
+.PHONY : lookup
+
+# fast build rule for target.
+lookup/fast:
+	$(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/build
+.PHONY : lookup/fast
+
+#=============================================================================
+# Target rules for targets named train-text-from-scratch
+
+# Build rule for target.
+train-text-from-scratch: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch
+.PHONY : train-text-from-scratch
+
+# fast build rule for target.
+train-text-from-scratch/fast:
+	$(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build
+.PHONY : train-text-from-scratch/fast
+
+#=============================================================================
+# Target rules for targets named export-lora
+
+# Build rule for target.
+export-lora: cmake_check_build_system
+	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora
+.PHONY : export-lora
+
+# fast build rule for target.
+export-lora/fast:
+	$(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/build
+.PHONY : export-lora/fast
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... baby-llama"
+	@echo "... batched"
+	@echo "... batched-bench"
+	@echo "... beam-search"
+	@echo "... benchmark"
+	@echo "... convert-llama2c-to-ggml"
+	@echo "... embedding"
+	@echo "... export-lora"
+	@echo "... finetune"
+	@echo "... infill"
+	@echo "... llama-bench"
+	@echo "... llava"
+	@echo "... llava-cli"
+	@echo "... llava_static"
+	@echo "... lookahead"
+	@echo "... lookup"
+	@echo "... ls-sycl-device"
+	@echo "... main"
+	@echo "... parallel"
+	@echo "... perplexity"
+	@echo "... quantize"
+	@echo "... quantize-stats"
+	@echo "... save-load-state"
+	@echo "... simple"
+	@echo "... speculative"
+	@echo "... tokenize"
+	@echo "... train-text-from-scratch"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/README.md b/examples/sycl/README.md
new file mode 100644
index 0000000000000..6b38bb8a8420a
--- /dev/null
+++ b/examples/sycl/README.md
@@ -0,0 +1,47 @@
+# llama.cpp/example/sycl
+
+This example program provide the tools for llama.cpp for SYCL on Intel GPU.
+
+## Tool
+
+|Tool Name| Function|Status|
+|-|-|-|
+|ls-sycl-device| List all SYCL devices with ID, compute capability, max work group size, ect.|Support|
+
+### ls-sycl-device
+
+List all SYCL devices with ID, compute capability, max work group size, ect.
+
+1. Build the llama.cpp for SYCL for all targets.
+
+2. Enable oneAPI running environment
+
+```
+source /opt/intel/oneapi/setvars.sh
+```
+
+3. Execute
+
+```
+./build/bin/ls-sycl-device
+```
+
+Check the ID in startup log, like:
+
+```
+found 4 SYCL devices:
+  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
+	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
+	max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
+  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
+	max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
+  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
+	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+
+```
+
+|Attribute|Note|
+|-|-|
+|compute capability 1.3|Level-zero running time, recommended |
+|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
diff --git a/examples/sycl/baby-llama/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/baby-llama/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/baby-llama/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..8a665ff63d034
--- /dev/null
+++ b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama/baby-llama.cpp" "baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o" "gcc" "baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/build.make b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/build.make
new file mode 100644
index 0000000000000..99030e77f7436
--- /dev/null
+++ b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include baby-llama/CMakeFiles/baby-llama.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include baby-llama/CMakeFiles/baby-llama.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include baby-llama/CMakeFiles/baby-llama.dir/flags.make
+
+baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o: baby-llama/CMakeFiles/baby-llama.dir/flags.make
+baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o: ../baby-llama/baby-llama.cpp
+baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o: baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o -MF CMakeFiles/baby-llama.dir/baby-llama.o.d -o CMakeFiles/baby-llama.dir/baby-llama.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama/baby-llama.cpp
+
+baby-llama/CMakeFiles/baby-llama.dir/baby-llama.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/baby-llama.dir/baby-llama.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama/baby-llama.cpp > CMakeFiles/baby-llama.dir/baby-llama.i
+
+baby-llama/CMakeFiles/baby-llama.dir/baby-llama.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/baby-llama.dir/baby-llama.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama/baby-llama.cpp -o CMakeFiles/baby-llama.dir/baby-llama.s
+
+# Object files for target baby-llama
+baby__llama_OBJECTS = \
+"CMakeFiles/baby-llama.dir/baby-llama.o"
+
+# External object files for target baby-llama
+baby__llama_EXTERNAL_OBJECTS =
+
+baby-llama/baby-llama: baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o
+baby-llama/baby-llama: baby-llama/CMakeFiles/baby-llama.dir/build.make
+baby-llama/baby-llama: baby-llama/CMakeFiles/baby-llama.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable baby-llama"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/baby-llama.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+baby-llama/CMakeFiles/baby-llama.dir/build: baby-llama/baby-llama
+.PHONY : baby-llama/CMakeFiles/baby-llama.dir/build
+
+baby-llama/CMakeFiles/baby-llama.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama && $(CMAKE_COMMAND) -P CMakeFiles/baby-llama.dir/cmake_clean.cmake
+.PHONY : baby-llama/CMakeFiles/baby-llama.dir/clean
+
+baby-llama/CMakeFiles/baby-llama.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : baby-llama/CMakeFiles/baby-llama.dir/depend
+
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/cmake_clean.cmake b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..6a49649a74e35
--- /dev/null
+++ b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/baby-llama.dir/baby-llama.o"
+  "CMakeFiles/baby-llama.dir/baby-llama.o.d"
+  "baby-llama"
+  "baby-llama.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/baby-llama.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.make b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.make
new file mode 100644
index 0000000000000..f8bff7f24a6ad
--- /dev/null
+++ b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for baby-llama.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.ts b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..2ec3a9c66baba
--- /dev/null
+++ b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for baby-llama.
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/depend.make b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/depend.make
new file mode 100644
index 0000000000000..b8284dc182864
--- /dev/null
+++ b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for baby-llama.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/flags.make b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/link.txt b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/link.txt
new file mode 100644
index 0000000000000..c9ea6a9618a17
--- /dev/null
+++ b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/baby-llama.dir/baby-llama.o -o baby-llama  -lcommon -lllama 
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/progress.make b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/progress.make
new file mode 100644
index 0000000000000..abadeb0c3abaa
--- /dev/null
+++ b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 1
+CMAKE_PROGRESS_2 = 2
+
diff --git a/examples/sycl/baby-llama/CMakeFiles/progress.marks b/examples/sycl/baby-llama/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/baby-llama/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/baby-llama/Makefile b/examples/sycl/baby-llama/Makefile
new file mode 100644
index 0000000000000..8bf34c6f86aa5
--- /dev/null
+++ b/examples/sycl/baby-llama/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+baby-llama/CMakeFiles/baby-llama.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/CMakeFiles/baby-llama.dir/rule
+.PHONY : baby-llama/CMakeFiles/baby-llama.dir/rule
+
+# Convenience name for target.
+baby-llama: baby-llama/CMakeFiles/baby-llama.dir/rule
+.PHONY : baby-llama
+
+# fast build rule for target.
+baby-llama/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/build
+.PHONY : baby-llama/fast
+
+# target to build an object file
+baby-llama.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o
+.PHONY : baby-llama.o
+
+# target to preprocess a source file
+baby-llama.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/baby-llama.i
+.PHONY : baby-llama.i
+
+# target to generate assembly for a file
+baby-llama.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/baby-llama.s
+.PHONY : baby-llama.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... baby-llama"
+	@echo "... baby-llama.o"
+	@echo "... baby-llama.i"
+	@echo "... baby-llama.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/baby-llama/cmake_install.cmake b/examples/sycl/baby-llama/cmake_install.cmake
new file mode 100644
index 0000000000000..86ee23ec75026
--- /dev/null
+++ b/examples/sycl/baby-llama/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/baby-llama")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/batched-bench/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/batched-bench/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/batched-bench/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..13582c409e5ac
--- /dev/null
+++ b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench/batched-bench.cpp" "batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o" "gcc" "batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/build.make b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/build.make
new file mode 100644
index 0000000000000..bf64db49915de
--- /dev/null
+++ b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include batched-bench/CMakeFiles/batched-bench.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include batched-bench/CMakeFiles/batched-bench.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include batched-bench/CMakeFiles/batched-bench.dir/flags.make
+
+batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o: batched-bench/CMakeFiles/batched-bench.dir/flags.make
+batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o: ../batched-bench/batched-bench.cpp
+batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o: batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o -MF CMakeFiles/batched-bench.dir/batched-bench.o.d -o CMakeFiles/batched-bench.dir/batched-bench.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench/batched-bench.cpp
+
+batched-bench/CMakeFiles/batched-bench.dir/batched-bench.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/batched-bench.dir/batched-bench.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench/batched-bench.cpp > CMakeFiles/batched-bench.dir/batched-bench.i
+
+batched-bench/CMakeFiles/batched-bench.dir/batched-bench.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/batched-bench.dir/batched-bench.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench/batched-bench.cpp -o CMakeFiles/batched-bench.dir/batched-bench.s
+
+# Object files for target batched-bench
+batched__bench_OBJECTS = \
+"CMakeFiles/batched-bench.dir/batched-bench.o"
+
+# External object files for target batched-bench
+batched__bench_EXTERNAL_OBJECTS =
+
+batched-bench/batched-bench: batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o
+batched-bench/batched-bench: batched-bench/CMakeFiles/batched-bench.dir/build.make
+batched-bench/batched-bench: batched-bench/CMakeFiles/batched-bench.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable batched-bench"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/batched-bench.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+batched-bench/CMakeFiles/batched-bench.dir/build: batched-bench/batched-bench
+.PHONY : batched-bench/CMakeFiles/batched-bench.dir/build
+
+batched-bench/CMakeFiles/batched-bench.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench && $(CMAKE_COMMAND) -P CMakeFiles/batched-bench.dir/cmake_clean.cmake
+.PHONY : batched-bench/CMakeFiles/batched-bench.dir/clean
+
+batched-bench/CMakeFiles/batched-bench.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : batched-bench/CMakeFiles/batched-bench.dir/depend
+
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/cmake_clean.cmake b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..96618af56680c
--- /dev/null
+++ b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/batched-bench.dir/batched-bench.o"
+  "CMakeFiles/batched-bench.dir/batched-bench.o.d"
+  "batched-bench"
+  "batched-bench.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/batched-bench.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.make b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.make
new file mode 100644
index 0000000000000..a0119aa940c10
--- /dev/null
+++ b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for batched-bench.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.ts b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..dce1c2dad71f8
--- /dev/null
+++ b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for batched-bench.
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/depend.make b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/depend.make
new file mode 100644
index 0000000000000..163222cab2552
--- /dev/null
+++ b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for batched-bench.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/flags.make b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/link.txt b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/link.txt
new file mode 100644
index 0000000000000..a2173ac6a4c48
--- /dev/null
+++ b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/batched-bench.dir/batched-bench.o -o batched-bench  -lcommon -lllama 
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/progress.make b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/progress.make
new file mode 100644
index 0000000000000..3a86673aa7c18
--- /dev/null
+++ b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 5
+CMAKE_PROGRESS_2 = 6
+
diff --git a/examples/sycl/batched-bench/CMakeFiles/progress.marks b/examples/sycl/batched-bench/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/batched-bench/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/batched-bench/Makefile b/examples/sycl/batched-bench/Makefile
new file mode 100644
index 0000000000000..9643d8cd01572
--- /dev/null
+++ b/examples/sycl/batched-bench/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+batched-bench/CMakeFiles/batched-bench.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/CMakeFiles/batched-bench.dir/rule
+.PHONY : batched-bench/CMakeFiles/batched-bench.dir/rule
+
+# Convenience name for target.
+batched-bench: batched-bench/CMakeFiles/batched-bench.dir/rule
+.PHONY : batched-bench
+
+# fast build rule for target.
+batched-bench/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/build
+.PHONY : batched-bench/fast
+
+# target to build an object file
+batched-bench.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o
+.PHONY : batched-bench.o
+
+# target to preprocess a source file
+batched-bench.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/batched-bench.i
+.PHONY : batched-bench.i
+
+# target to generate assembly for a file
+batched-bench.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/batched-bench.s
+.PHONY : batched-bench.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... batched-bench"
+	@echo "... batched-bench.o"
+	@echo "... batched-bench.i"
+	@echo "... batched-bench.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/batched-bench/cmake_install.cmake b/examples/sycl/batched-bench/cmake_install.cmake
new file mode 100644
index 0000000000000..baa6e752d936f
--- /dev/null
+++ b/examples/sycl/batched-bench/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/batched-bench")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/batched/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/batched/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/batched/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/DependInfo.cmake b/examples/sycl/batched/CMakeFiles/batched.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..cfa8cdcb00d4e
--- /dev/null
+++ b/examples/sycl/batched/CMakeFiles/batched.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/batched/batched.cpp" "batched/CMakeFiles/batched.dir/batched.o" "gcc" "batched/CMakeFiles/batched.dir/batched.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/build.make b/examples/sycl/batched/CMakeFiles/batched.dir/build.make
new file mode 100644
index 0000000000000..b69a5c8e00476
--- /dev/null
+++ b/examples/sycl/batched/CMakeFiles/batched.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include batched/CMakeFiles/batched.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include batched/CMakeFiles/batched.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include batched/CMakeFiles/batched.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include batched/CMakeFiles/batched.dir/flags.make
+
+batched/CMakeFiles/batched.dir/batched.o: batched/CMakeFiles/batched.dir/flags.make
+batched/CMakeFiles/batched.dir/batched.o: ../batched/batched.cpp
+batched/CMakeFiles/batched.dir/batched.o: batched/CMakeFiles/batched.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object batched/CMakeFiles/batched.dir/batched.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT batched/CMakeFiles/batched.dir/batched.o -MF CMakeFiles/batched.dir/batched.o.d -o CMakeFiles/batched.dir/batched.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/batched/batched.cpp
+
+batched/CMakeFiles/batched.dir/batched.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/batched.dir/batched.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/batched/batched.cpp > CMakeFiles/batched.dir/batched.i
+
+batched/CMakeFiles/batched.dir/batched.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/batched.dir/batched.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/batched/batched.cpp -o CMakeFiles/batched.dir/batched.s
+
+# Object files for target batched
+batched_OBJECTS = \
+"CMakeFiles/batched.dir/batched.o"
+
+# External object files for target batched
+batched_EXTERNAL_OBJECTS =
+
+batched/batched: batched/CMakeFiles/batched.dir/batched.o
+batched/batched: batched/CMakeFiles/batched.dir/build.make
+batched/batched: batched/CMakeFiles/batched.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable batched"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/batched.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+batched/CMakeFiles/batched.dir/build: batched/batched
+.PHONY : batched/CMakeFiles/batched.dir/build
+
+batched/CMakeFiles/batched.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched && $(CMAKE_COMMAND) -P CMakeFiles/batched.dir/cmake_clean.cmake
+.PHONY : batched/CMakeFiles/batched.dir/clean
+
+batched/CMakeFiles/batched.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/batched /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/batched.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : batched/CMakeFiles/batched.dir/depend
+
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/cmake_clean.cmake b/examples/sycl/batched/CMakeFiles/batched.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..cec64ae00c700
--- /dev/null
+++ b/examples/sycl/batched/CMakeFiles/batched.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/batched.dir/batched.o"
+  "CMakeFiles/batched.dir/batched.o.d"
+  "batched"
+  "batched.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/batched.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.make b/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.make
new file mode 100644
index 0000000000000..9a449d3648e98
--- /dev/null
+++ b/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for batched.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.ts b/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..803668d02f618
--- /dev/null
+++ b/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for batched.
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/depend.make b/examples/sycl/batched/CMakeFiles/batched.dir/depend.make
new file mode 100644
index 0000000000000..c583466f650c2
--- /dev/null
+++ b/examples/sycl/batched/CMakeFiles/batched.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for batched.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/flags.make b/examples/sycl/batched/CMakeFiles/batched.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/batched/CMakeFiles/batched.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/link.txt b/examples/sycl/batched/CMakeFiles/batched.dir/link.txt
new file mode 100644
index 0000000000000..26eb25de70ccc
--- /dev/null
+++ b/examples/sycl/batched/CMakeFiles/batched.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/batched.dir/batched.o -o batched  -lcommon -lllama 
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/progress.make b/examples/sycl/batched/CMakeFiles/batched.dir/progress.make
new file mode 100644
index 0000000000000..8c8fb6fbbc138
--- /dev/null
+++ b/examples/sycl/batched/CMakeFiles/batched.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 3
+CMAKE_PROGRESS_2 = 4
+
diff --git a/examples/sycl/batched/CMakeFiles/progress.marks b/examples/sycl/batched/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/batched/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/batched/Makefile b/examples/sycl/batched/Makefile
new file mode 100644
index 0000000000000..ce954a5513214
--- /dev/null
+++ b/examples/sycl/batched/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+batched/CMakeFiles/batched.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/CMakeFiles/batched.dir/rule
+.PHONY : batched/CMakeFiles/batched.dir/rule
+
+# Convenience name for target.
+batched: batched/CMakeFiles/batched.dir/rule
+.PHONY : batched
+
+# fast build rule for target.
+batched/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/build
+.PHONY : batched/fast
+
+# target to build an object file
+batched.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/batched.o
+.PHONY : batched.o
+
+# target to preprocess a source file
+batched.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/batched.i
+.PHONY : batched.i
+
+# target to generate assembly for a file
+batched.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/batched.s
+.PHONY : batched.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... batched"
+	@echo "... batched.o"
+	@echo "... batched.i"
+	@echo "... batched.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/batched/cmake_install.cmake b/examples/sycl/batched/cmake_install.cmake
new file mode 100644
index 0000000000000..e8c0214cfe960
--- /dev/null
+++ b/examples/sycl/batched/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/batched
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/batched")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/beam-search/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/beam-search/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/beam-search/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..7822f7a7ca873
--- /dev/null
+++ b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/beam-search/beam-search.cpp" "beam-search/CMakeFiles/beam-search.dir/beam-search.o" "gcc" "beam-search/CMakeFiles/beam-search.dir/beam-search.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/build.make b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/build.make
new file mode 100644
index 0000000000000..e462d1b34af33
--- /dev/null
+++ b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include beam-search/CMakeFiles/beam-search.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include beam-search/CMakeFiles/beam-search.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include beam-search/CMakeFiles/beam-search.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include beam-search/CMakeFiles/beam-search.dir/flags.make
+
+beam-search/CMakeFiles/beam-search.dir/beam-search.o: beam-search/CMakeFiles/beam-search.dir/flags.make
+beam-search/CMakeFiles/beam-search.dir/beam-search.o: ../beam-search/beam-search.cpp
+beam-search/CMakeFiles/beam-search.dir/beam-search.o: beam-search/CMakeFiles/beam-search.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object beam-search/CMakeFiles/beam-search.dir/beam-search.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT beam-search/CMakeFiles/beam-search.dir/beam-search.o -MF CMakeFiles/beam-search.dir/beam-search.o.d -o CMakeFiles/beam-search.dir/beam-search.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/beam-search/beam-search.cpp
+
+beam-search/CMakeFiles/beam-search.dir/beam-search.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/beam-search.dir/beam-search.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/beam-search/beam-search.cpp > CMakeFiles/beam-search.dir/beam-search.i
+
+beam-search/CMakeFiles/beam-search.dir/beam-search.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/beam-search.dir/beam-search.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/beam-search/beam-search.cpp -o CMakeFiles/beam-search.dir/beam-search.s
+
+# Object files for target beam-search
+beam__search_OBJECTS = \
+"CMakeFiles/beam-search.dir/beam-search.o"
+
+# External object files for target beam-search
+beam__search_EXTERNAL_OBJECTS =
+
+beam-search/beam-search: beam-search/CMakeFiles/beam-search.dir/beam-search.o
+beam-search/beam-search: beam-search/CMakeFiles/beam-search.dir/build.make
+beam-search/beam-search: beam-search/CMakeFiles/beam-search.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable beam-search"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/beam-search.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+beam-search/CMakeFiles/beam-search.dir/build: beam-search/beam-search
+.PHONY : beam-search/CMakeFiles/beam-search.dir/build
+
+beam-search/CMakeFiles/beam-search.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search && $(CMAKE_COMMAND) -P CMakeFiles/beam-search.dir/cmake_clean.cmake
+.PHONY : beam-search/CMakeFiles/beam-search.dir/clean
+
+beam-search/CMakeFiles/beam-search.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/beam-search /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : beam-search/CMakeFiles/beam-search.dir/depend
+
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/cmake_clean.cmake b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..e28d241a266ec
--- /dev/null
+++ b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/beam-search.dir/beam-search.o"
+  "CMakeFiles/beam-search.dir/beam-search.o.d"
+  "beam-search"
+  "beam-search.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/beam-search.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.make b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.make
new file mode 100644
index 0000000000000..7b9ac25582ada
--- /dev/null
+++ b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for beam-search.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.ts b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..1117fb6f80e48
--- /dev/null
+++ b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for beam-search.
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/depend.make b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/depend.make
new file mode 100644
index 0000000000000..37ec7a3f8206c
--- /dev/null
+++ b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for beam-search.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/flags.make b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/link.txt b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/link.txt
new file mode 100644
index 0000000000000..1acfe16a7a283
--- /dev/null
+++ b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/beam-search.dir/beam-search.o -o beam-search  -lcommon -lllama 
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/progress.make b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/progress.make
new file mode 100644
index 0000000000000..72bb7dd025afc
--- /dev/null
+++ b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 7
+CMAKE_PROGRESS_2 = 8
+
diff --git a/examples/sycl/beam-search/CMakeFiles/progress.marks b/examples/sycl/beam-search/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/beam-search/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/beam-search/Makefile b/examples/sycl/beam-search/Makefile
new file mode 100644
index 0000000000000..a1a902339db93
--- /dev/null
+++ b/examples/sycl/beam-search/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+beam-search/CMakeFiles/beam-search.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/CMakeFiles/beam-search.dir/rule
+.PHONY : beam-search/CMakeFiles/beam-search.dir/rule
+
+# Convenience name for target.
+beam-search: beam-search/CMakeFiles/beam-search.dir/rule
+.PHONY : beam-search
+
+# fast build rule for target.
+beam-search/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/build
+.PHONY : beam-search/fast
+
+# target to build an object file
+beam-search.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/beam-search.o
+.PHONY : beam-search.o
+
+# target to preprocess a source file
+beam-search.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/beam-search.i
+.PHONY : beam-search.i
+
+# target to generate assembly for a file
+beam-search.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/beam-search.s
+.PHONY : beam-search.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... beam-search"
+	@echo "... beam-search.o"
+	@echo "... beam-search.i"
+	@echo "... beam-search.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/beam-search/cmake_install.cmake b/examples/sycl/beam-search/cmake_install.cmake
new file mode 100644
index 0000000000000..33cf7cf46a32a
--- /dev/null
+++ b/examples/sycl/beam-search/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/beam-search
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/beam-search")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/benchmark/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/benchmark/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/benchmark/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..25e6406f3ed78
--- /dev/null
+++ b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/benchmark/benchmark-matmult.cpp" "benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o" "gcc" "benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/build.make b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/build.make
new file mode 100644
index 0000000000000..84fb06db47923
--- /dev/null
+++ b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include benchmark/CMakeFiles/benchmark.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include benchmark/CMakeFiles/benchmark.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include benchmark/CMakeFiles/benchmark.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include benchmark/CMakeFiles/benchmark.dir/flags.make
+
+benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o: benchmark/CMakeFiles/benchmark.dir/flags.make
+benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o: ../benchmark/benchmark-matmult.cpp
+benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o: benchmark/CMakeFiles/benchmark.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o -MF CMakeFiles/benchmark.dir/benchmark-matmult.o.d -o CMakeFiles/benchmark.dir/benchmark-matmult.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/benchmark/benchmark-matmult.cpp
+
+benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/benchmark.dir/benchmark-matmult.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/benchmark/benchmark-matmult.cpp > CMakeFiles/benchmark.dir/benchmark-matmult.i
+
+benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/benchmark.dir/benchmark-matmult.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/benchmark/benchmark-matmult.cpp -o CMakeFiles/benchmark.dir/benchmark-matmult.s
+
+# Object files for target benchmark
+benchmark_OBJECTS = \
+"CMakeFiles/benchmark.dir/benchmark-matmult.o"
+
+# External object files for target benchmark
+benchmark_EXTERNAL_OBJECTS =
+
+benchmark/benchmark: benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o
+benchmark/benchmark: benchmark/CMakeFiles/benchmark.dir/build.make
+benchmark/benchmark: benchmark/CMakeFiles/benchmark.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable benchmark"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/benchmark.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+benchmark/CMakeFiles/benchmark.dir/build: benchmark/benchmark
+.PHONY : benchmark/CMakeFiles/benchmark.dir/build
+
+benchmark/CMakeFiles/benchmark.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark && $(CMAKE_COMMAND) -P CMakeFiles/benchmark.dir/cmake_clean.cmake
+.PHONY : benchmark/CMakeFiles/benchmark.dir/clean
+
+benchmark/CMakeFiles/benchmark.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/benchmark /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : benchmark/CMakeFiles/benchmark.dir/depend
+
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/cmake_clean.cmake b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..870df51f87439
--- /dev/null
+++ b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/benchmark.dir/benchmark-matmult.o"
+  "CMakeFiles/benchmark.dir/benchmark-matmult.o.d"
+  "benchmark"
+  "benchmark.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/benchmark.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.make b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.make
new file mode 100644
index 0000000000000..871e0bce58355
--- /dev/null
+++ b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for benchmark.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.ts b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..66ee2110c21b7
--- /dev/null
+++ b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for benchmark.
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/depend.make b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/depend.make
new file mode 100644
index 0000000000000..71cca2eeae086
--- /dev/null
+++ b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for benchmark.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/flags.make b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/flags.make
new file mode 100644
index 0000000000000..d799cee981f62
--- /dev/null
+++ b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples -I/home/jianyuzh/ws/llama.cpp/develop/examples/benchmark/../../common
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/link.txt b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/link.txt
new file mode 100644
index 0000000000000..f5f38e36cd745
--- /dev/null
+++ b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/benchmark.dir/benchmark-matmult.o -o benchmark  -lllama -lbuild_info 
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/progress.make b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/progress.make
new file mode 100644
index 0000000000000..b700c2c902219
--- /dev/null
+++ b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 9
+CMAKE_PROGRESS_2 = 10
+
diff --git a/examples/sycl/benchmark/CMakeFiles/progress.marks b/examples/sycl/benchmark/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/benchmark/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/benchmark/Makefile b/examples/sycl/benchmark/Makefile
new file mode 100644
index 0000000000000..68795570edc0b
--- /dev/null
+++ b/examples/sycl/benchmark/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+benchmark/CMakeFiles/benchmark.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/CMakeFiles/benchmark.dir/rule
+.PHONY : benchmark/CMakeFiles/benchmark.dir/rule
+
+# Convenience name for target.
+benchmark: benchmark/CMakeFiles/benchmark.dir/rule
+.PHONY : benchmark
+
+# fast build rule for target.
+benchmark/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/build
+.PHONY : benchmark/fast
+
+# target to build an object file
+benchmark-matmult.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o
+.PHONY : benchmark-matmult.o
+
+# target to preprocess a source file
+benchmark-matmult.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.i
+.PHONY : benchmark-matmult.i
+
+# target to generate assembly for a file
+benchmark-matmult.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.s
+.PHONY : benchmark-matmult.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... benchmark"
+	@echo "... benchmark-matmult.o"
+	@echo "... benchmark-matmult.i"
+	@echo "... benchmark-matmult.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/benchmark/cmake_install.cmake b/examples/sycl/benchmark/cmake_install.cmake
new file mode 100644
index 0000000000000..d2a89c99230ff
--- /dev/null
+++ b/examples/sycl/benchmark/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/benchmark
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/benchmark")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/cmake_install.cmake b/examples/sycl/cmake_install.cmake
new file mode 100644
index 0000000000000..e365c411dabd6
--- /dev/null
+++ b/examples/sycl/cmake_install.cmake
@@ -0,0 +1,84 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if(NOT CMAKE_INSTALL_LOCAL_ONLY)
+  # Include the install script for each subdirectory.
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/cmake_install.cmake")
+  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/cmake_install.cmake")
+
+endif()
+
+if(CMAKE_INSTALL_COMPONENT)
+  set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INSTALL_COMPONENT}.txt")
+else()
+  set(CMAKE_INSTALL_MANIFEST "install_manifest.txt")
+endif()
+
+string(REPLACE ";" "\n" CMAKE_INSTALL_MANIFEST_CONTENT
+       "${CMAKE_INSTALL_MANIFEST_FILES}")
+file(WRITE "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/${CMAKE_INSTALL_MANIFEST}"
+     "${CMAKE_INSTALL_MANIFEST_CONTENT}")
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..54c704ca3ca01
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp" "convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o" "gcc" "convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make
new file mode 100644
index 0000000000000..ee8cc8e191ff9
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make
+
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o: ../convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o -MF CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o.d -o CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
+
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp > CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.i
+
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp -o CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.s
+
+# Object files for target convert-llama2c-to-ggml
+convert__llama2c__to__ggml_OBJECTS = \
+"CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o"
+
+# External object files for target convert-llama2c-to-ggml
+convert__llama2c__to__ggml_EXTERNAL_OBJECTS =
+
+convert-llama2c-to-ggml/convert-llama2c-to-ggml: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o
+convert-llama2c-to-ggml/convert-llama2c-to-ggml: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make
+convert-llama2c-to-ggml/convert-llama2c-to-ggml: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable convert-llama2c-to-ggml"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/convert-llama2c-to-ggml.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build: convert-llama2c-to-ggml/convert-llama2c-to-ggml
+.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build
+
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml && $(CMAKE_COMMAND) -P CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean.cmake
+.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean
+
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend
+
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean.cmake b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..27a9734b51264
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o"
+  "CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o.d"
+  "convert-llama2c-to-ggml"
+  "convert-llama2c-to-ggml.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.make b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.make
new file mode 100644
index 0000000000000..0c7de424c1697
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for convert-llama2c-to-ggml.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.ts b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..4db962ade5dfe
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for convert-llama2c-to-ggml.
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend.make b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend.make
new file mode 100644
index 0000000000000..7a48cf562a9dc
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for convert-llama2c-to-ggml.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/link.txt b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/link.txt
new file mode 100644
index 0000000000000..fadbf2651cb1d
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o -o convert-llama2c-to-ggml  -lcommon -lllama 
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/progress.make b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/progress.make
new file mode 100644
index 0000000000000..596289c0fd56a
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 11
+CMAKE_PROGRESS_2 = 12
+
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/progress.marks b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/convert-llama2c-to-ggml/Makefile b/examples/sycl/convert-llama2c-to-ggml/Makefile
new file mode 100644
index 0000000000000..37c3f05fd314d
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule
+.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule
+
+# Convenience name for target.
+convert-llama2c-to-ggml: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule
+.PHONY : convert-llama2c-to-ggml
+
+# fast build rule for target.
+convert-llama2c-to-ggml/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build
+.PHONY : convert-llama2c-to-ggml/fast
+
+# target to build an object file
+convert-llama2c-to-ggml.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o
+.PHONY : convert-llama2c-to-ggml.o
+
+# target to preprocess a source file
+convert-llama2c-to-ggml.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.i
+.PHONY : convert-llama2c-to-ggml.i
+
+# target to generate assembly for a file
+convert-llama2c-to-ggml.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.s
+.PHONY : convert-llama2c-to-ggml.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... convert-llama2c-to-ggml"
+	@echo "... convert-llama2c-to-ggml.o"
+	@echo "... convert-llama2c-to-ggml.i"
+	@echo "... convert-llama2c-to-ggml.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/convert-llama2c-to-ggml/cmake_install.cmake b/examples/sycl/convert-llama2c-to-ggml/cmake_install.cmake
new file mode 100644
index 0000000000000..4e812cd016981
--- /dev/null
+++ b/examples/sycl/convert-llama2c-to-ggml/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/convert-llama2c-to-ggml")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/embedding/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/embedding/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/embedding/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/DependInfo.cmake b/examples/sycl/embedding/CMakeFiles/embedding.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..212c12d98822f
--- /dev/null
+++ b/examples/sycl/embedding/CMakeFiles/embedding.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/embedding/embedding.cpp" "embedding/CMakeFiles/embedding.dir/embedding.o" "gcc" "embedding/CMakeFiles/embedding.dir/embedding.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/build.make b/examples/sycl/embedding/CMakeFiles/embedding.dir/build.make
new file mode 100644
index 0000000000000..a5788eed19ffa
--- /dev/null
+++ b/examples/sycl/embedding/CMakeFiles/embedding.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include embedding/CMakeFiles/embedding.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include embedding/CMakeFiles/embedding.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include embedding/CMakeFiles/embedding.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include embedding/CMakeFiles/embedding.dir/flags.make
+
+embedding/CMakeFiles/embedding.dir/embedding.o: embedding/CMakeFiles/embedding.dir/flags.make
+embedding/CMakeFiles/embedding.dir/embedding.o: ../embedding/embedding.cpp
+embedding/CMakeFiles/embedding.dir/embedding.o: embedding/CMakeFiles/embedding.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object embedding/CMakeFiles/embedding.dir/embedding.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT embedding/CMakeFiles/embedding.dir/embedding.o -MF CMakeFiles/embedding.dir/embedding.o.d -o CMakeFiles/embedding.dir/embedding.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/embedding/embedding.cpp
+
+embedding/CMakeFiles/embedding.dir/embedding.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/embedding.dir/embedding.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/embedding/embedding.cpp > CMakeFiles/embedding.dir/embedding.i
+
+embedding/CMakeFiles/embedding.dir/embedding.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/embedding.dir/embedding.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/embedding/embedding.cpp -o CMakeFiles/embedding.dir/embedding.s
+
+# Object files for target embedding
+embedding_OBJECTS = \
+"CMakeFiles/embedding.dir/embedding.o"
+
+# External object files for target embedding
+embedding_EXTERNAL_OBJECTS =
+
+embedding/embedding: embedding/CMakeFiles/embedding.dir/embedding.o
+embedding/embedding: embedding/CMakeFiles/embedding.dir/build.make
+embedding/embedding: embedding/CMakeFiles/embedding.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable embedding"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/embedding.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+embedding/CMakeFiles/embedding.dir/build: embedding/embedding
+.PHONY : embedding/CMakeFiles/embedding.dir/build
+
+embedding/CMakeFiles/embedding.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding && $(CMAKE_COMMAND) -P CMakeFiles/embedding.dir/cmake_clean.cmake
+.PHONY : embedding/CMakeFiles/embedding.dir/clean
+
+embedding/CMakeFiles/embedding.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/embedding /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/embedding.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : embedding/CMakeFiles/embedding.dir/depend
+
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/cmake_clean.cmake b/examples/sycl/embedding/CMakeFiles/embedding.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..e6d17db5dfb54
--- /dev/null
+++ b/examples/sycl/embedding/CMakeFiles/embedding.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/embedding.dir/embedding.o"
+  "CMakeFiles/embedding.dir/embedding.o.d"
+  "embedding"
+  "embedding.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/embedding.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.make b/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.make
new file mode 100644
index 0000000000000..51d484f9f0010
--- /dev/null
+++ b/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for embedding.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.ts b/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..af47aa36d337d
--- /dev/null
+++ b/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for embedding.
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/depend.make b/examples/sycl/embedding/CMakeFiles/embedding.dir/depend.make
new file mode 100644
index 0000000000000..12b2e39ad1bd4
--- /dev/null
+++ b/examples/sycl/embedding/CMakeFiles/embedding.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for embedding.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/flags.make b/examples/sycl/embedding/CMakeFiles/embedding.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/embedding/CMakeFiles/embedding.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/link.txt b/examples/sycl/embedding/CMakeFiles/embedding.dir/link.txt
new file mode 100644
index 0000000000000..7d5713ba926ec
--- /dev/null
+++ b/examples/sycl/embedding/CMakeFiles/embedding.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/embedding.dir/embedding.o -o embedding  -lcommon -lllama 
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/progress.make b/examples/sycl/embedding/CMakeFiles/embedding.dir/progress.make
new file mode 100644
index 0000000000000..d92f75a2935ea
--- /dev/null
+++ b/examples/sycl/embedding/CMakeFiles/embedding.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 13
+CMAKE_PROGRESS_2 = 14
+
diff --git a/examples/sycl/embedding/CMakeFiles/progress.marks b/examples/sycl/embedding/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/embedding/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/embedding/Makefile b/examples/sycl/embedding/Makefile
new file mode 100644
index 0000000000000..588a49a8da14d
--- /dev/null
+++ b/examples/sycl/embedding/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+embedding/CMakeFiles/embedding.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/CMakeFiles/embedding.dir/rule
+.PHONY : embedding/CMakeFiles/embedding.dir/rule
+
+# Convenience name for target.
+embedding: embedding/CMakeFiles/embedding.dir/rule
+.PHONY : embedding
+
+# fast build rule for target.
+embedding/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/build
+.PHONY : embedding/fast
+
+# target to build an object file
+embedding.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/embedding.o
+.PHONY : embedding.o
+
+# target to preprocess a source file
+embedding.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/embedding.i
+.PHONY : embedding.i
+
+# target to generate assembly for a file
+embedding.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/embedding.s
+.PHONY : embedding.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... embedding"
+	@echo "... embedding.o"
+	@echo "... embedding.i"
+	@echo "... embedding.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/embedding/cmake_install.cmake b/examples/sycl/embedding/cmake_install.cmake
new file mode 100644
index 0000000000000..1ed94ada9372d
--- /dev/null
+++ b/examples/sycl/embedding/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/embedding
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/embedding")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/export-lora/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/export-lora/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/export-lora/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..509dd9fd84d2f
--- /dev/null
+++ b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/export-lora/export-lora.cpp" "export-lora/CMakeFiles/export-lora.dir/export-lora.o" "gcc" "export-lora/CMakeFiles/export-lora.dir/export-lora.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/build.make b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/build.make
new file mode 100644
index 0000000000000..a30d776294059
--- /dev/null
+++ b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include export-lora/CMakeFiles/export-lora.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include export-lora/CMakeFiles/export-lora.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include export-lora/CMakeFiles/export-lora.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include export-lora/CMakeFiles/export-lora.dir/flags.make
+
+export-lora/CMakeFiles/export-lora.dir/export-lora.o: export-lora/CMakeFiles/export-lora.dir/flags.make
+export-lora/CMakeFiles/export-lora.dir/export-lora.o: ../export-lora/export-lora.cpp
+export-lora/CMakeFiles/export-lora.dir/export-lora.o: export-lora/CMakeFiles/export-lora.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object export-lora/CMakeFiles/export-lora.dir/export-lora.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT export-lora/CMakeFiles/export-lora.dir/export-lora.o -MF CMakeFiles/export-lora.dir/export-lora.o.d -o CMakeFiles/export-lora.dir/export-lora.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/export-lora/export-lora.cpp
+
+export-lora/CMakeFiles/export-lora.dir/export-lora.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/export-lora.dir/export-lora.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/export-lora/export-lora.cpp > CMakeFiles/export-lora.dir/export-lora.i
+
+export-lora/CMakeFiles/export-lora.dir/export-lora.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/export-lora.dir/export-lora.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/export-lora/export-lora.cpp -o CMakeFiles/export-lora.dir/export-lora.s
+
+# Object files for target export-lora
+export__lora_OBJECTS = \
+"CMakeFiles/export-lora.dir/export-lora.o"
+
+# External object files for target export-lora
+export__lora_EXTERNAL_OBJECTS =
+
+export-lora/export-lora: export-lora/CMakeFiles/export-lora.dir/export-lora.o
+export-lora/export-lora: export-lora/CMakeFiles/export-lora.dir/build.make
+export-lora/export-lora: export-lora/CMakeFiles/export-lora.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable export-lora"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/export-lora.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+export-lora/CMakeFiles/export-lora.dir/build: export-lora/export-lora
+.PHONY : export-lora/CMakeFiles/export-lora.dir/build
+
+export-lora/CMakeFiles/export-lora.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora && $(CMAKE_COMMAND) -P CMakeFiles/export-lora.dir/cmake_clean.cmake
+.PHONY : export-lora/CMakeFiles/export-lora.dir/clean
+
+export-lora/CMakeFiles/export-lora.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/export-lora /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : export-lora/CMakeFiles/export-lora.dir/depend
+
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/cmake_clean.cmake b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..e4439996f1cc8
--- /dev/null
+++ b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/export-lora.dir/export-lora.o"
+  "CMakeFiles/export-lora.dir/export-lora.o.d"
+  "export-lora"
+  "export-lora.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/export-lora.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.make b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.make
new file mode 100644
index 0000000000000..bd6b5614f8f24
--- /dev/null
+++ b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for export-lora.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.ts b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..060cd6097e48a
--- /dev/null
+++ b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for export-lora.
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/depend.make b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/depend.make
new file mode 100644
index 0000000000000..98a9b14434306
--- /dev/null
+++ b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for export-lora.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/flags.make b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/link.txt b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/link.txt
new file mode 100644
index 0000000000000..17cb92024e35d
--- /dev/null
+++ b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/export-lora.dir/export-lora.o -o export-lora  -lcommon -lllama 
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/progress.make b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/progress.make
new file mode 100644
index 0000000000000..a35c33b98d591
--- /dev/null
+++ b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 15
+CMAKE_PROGRESS_2 = 16
+
diff --git a/examples/sycl/export-lora/CMakeFiles/progress.marks b/examples/sycl/export-lora/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/export-lora/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/export-lora/Makefile b/examples/sycl/export-lora/Makefile
new file mode 100644
index 0000000000000..81a26eebe5704
--- /dev/null
+++ b/examples/sycl/export-lora/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+export-lora/CMakeFiles/export-lora.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/CMakeFiles/export-lora.dir/rule
+.PHONY : export-lora/CMakeFiles/export-lora.dir/rule
+
+# Convenience name for target.
+export-lora: export-lora/CMakeFiles/export-lora.dir/rule
+.PHONY : export-lora
+
+# fast build rule for target.
+export-lora/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/build
+.PHONY : export-lora/fast
+
+# target to build an object file
+export-lora.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/export-lora.o
+.PHONY : export-lora.o
+
+# target to preprocess a source file
+export-lora.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/export-lora.i
+.PHONY : export-lora.i
+
+# target to generate assembly for a file
+export-lora.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/export-lora.s
+.PHONY : export-lora.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... export-lora"
+	@echo "... export-lora.o"
+	@echo "... export-lora.i"
+	@echo "... export-lora.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/export-lora/cmake_install.cmake b/examples/sycl/export-lora/cmake_install.cmake
new file mode 100644
index 0000000000000..82df354c0669e
--- /dev/null
+++ b/examples/sycl/export-lora/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/export-lora
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/export-lora")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/finetune/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/finetune/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/finetune/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/DependInfo.cmake b/examples/sycl/finetune/CMakeFiles/finetune.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..1c4b740826406
--- /dev/null
+++ b/examples/sycl/finetune/CMakeFiles/finetune.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/finetune/finetune.cpp" "finetune/CMakeFiles/finetune.dir/finetune.o" "gcc" "finetune/CMakeFiles/finetune.dir/finetune.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/build.make b/examples/sycl/finetune/CMakeFiles/finetune.dir/build.make
new file mode 100644
index 0000000000000..87eb77fdf4f6e
--- /dev/null
+++ b/examples/sycl/finetune/CMakeFiles/finetune.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include finetune/CMakeFiles/finetune.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include finetune/CMakeFiles/finetune.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include finetune/CMakeFiles/finetune.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include finetune/CMakeFiles/finetune.dir/flags.make
+
+finetune/CMakeFiles/finetune.dir/finetune.o: finetune/CMakeFiles/finetune.dir/flags.make
+finetune/CMakeFiles/finetune.dir/finetune.o: ../finetune/finetune.cpp
+finetune/CMakeFiles/finetune.dir/finetune.o: finetune/CMakeFiles/finetune.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object finetune/CMakeFiles/finetune.dir/finetune.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT finetune/CMakeFiles/finetune.dir/finetune.o -MF CMakeFiles/finetune.dir/finetune.o.d -o CMakeFiles/finetune.dir/finetune.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/finetune/finetune.cpp
+
+finetune/CMakeFiles/finetune.dir/finetune.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/finetune.dir/finetune.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/finetune/finetune.cpp > CMakeFiles/finetune.dir/finetune.i
+
+finetune/CMakeFiles/finetune.dir/finetune.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/finetune.dir/finetune.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/finetune/finetune.cpp -o CMakeFiles/finetune.dir/finetune.s
+
+# Object files for target finetune
+finetune_OBJECTS = \
+"CMakeFiles/finetune.dir/finetune.o"
+
+# External object files for target finetune
+finetune_EXTERNAL_OBJECTS =
+
+finetune/finetune: finetune/CMakeFiles/finetune.dir/finetune.o
+finetune/finetune: finetune/CMakeFiles/finetune.dir/build.make
+finetune/finetune: finetune/CMakeFiles/finetune.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable finetune"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/finetune.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+finetune/CMakeFiles/finetune.dir/build: finetune/finetune
+.PHONY : finetune/CMakeFiles/finetune.dir/build
+
+finetune/CMakeFiles/finetune.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune && $(CMAKE_COMMAND) -P CMakeFiles/finetune.dir/cmake_clean.cmake
+.PHONY : finetune/CMakeFiles/finetune.dir/clean
+
+finetune/CMakeFiles/finetune.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/finetune /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/finetune.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : finetune/CMakeFiles/finetune.dir/depend
+
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/cmake_clean.cmake b/examples/sycl/finetune/CMakeFiles/finetune.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..ddc16e11e3bb0
--- /dev/null
+++ b/examples/sycl/finetune/CMakeFiles/finetune.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/finetune.dir/finetune.o"
+  "CMakeFiles/finetune.dir/finetune.o.d"
+  "finetune"
+  "finetune.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/finetune.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.make b/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.make
new file mode 100644
index 0000000000000..7b0c0a165e815
--- /dev/null
+++ b/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for finetune.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.ts b/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..60ac2baeed3ac
--- /dev/null
+++ b/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for finetune.
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/depend.make b/examples/sycl/finetune/CMakeFiles/finetune.dir/depend.make
new file mode 100644
index 0000000000000..69e9f67763a99
--- /dev/null
+++ b/examples/sycl/finetune/CMakeFiles/finetune.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for finetune.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/flags.make b/examples/sycl/finetune/CMakeFiles/finetune.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/finetune/CMakeFiles/finetune.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/link.txt b/examples/sycl/finetune/CMakeFiles/finetune.dir/link.txt
new file mode 100644
index 0000000000000..c93473a88cc6a
--- /dev/null
+++ b/examples/sycl/finetune/CMakeFiles/finetune.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/finetune.dir/finetune.o -o finetune  -lcommon -lllama 
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/progress.make b/examples/sycl/finetune/CMakeFiles/finetune.dir/progress.make
new file mode 100644
index 0000000000000..5a7451db601a4
--- /dev/null
+++ b/examples/sycl/finetune/CMakeFiles/finetune.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 17
+CMAKE_PROGRESS_2 = 18
+
diff --git a/examples/sycl/finetune/CMakeFiles/progress.marks b/examples/sycl/finetune/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/finetune/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/finetune/Makefile b/examples/sycl/finetune/Makefile
new file mode 100644
index 0000000000000..455898767ff8a
--- /dev/null
+++ b/examples/sycl/finetune/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+finetune/CMakeFiles/finetune.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/CMakeFiles/finetune.dir/rule
+.PHONY : finetune/CMakeFiles/finetune.dir/rule
+
+# Convenience name for target.
+finetune: finetune/CMakeFiles/finetune.dir/rule
+.PHONY : finetune
+
+# fast build rule for target.
+finetune/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/build
+.PHONY : finetune/fast
+
+# target to build an object file
+finetune.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/finetune.o
+.PHONY : finetune.o
+
+# target to preprocess a source file
+finetune.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/finetune.i
+.PHONY : finetune.i
+
+# target to generate assembly for a file
+finetune.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/finetune.s
+.PHONY : finetune.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... finetune"
+	@echo "... finetune.o"
+	@echo "... finetune.i"
+	@echo "... finetune.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/finetune/cmake_install.cmake b/examples/sycl/finetune/cmake_install.cmake
new file mode 100644
index 0000000000000..6ec05e0579313
--- /dev/null
+++ b/examples/sycl/finetune/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/finetune
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/finetune")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/infill/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/infill/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/infill/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/DependInfo.cmake b/examples/sycl/infill/CMakeFiles/infill.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..d2945cc856f9e
--- /dev/null
+++ b/examples/sycl/infill/CMakeFiles/infill.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/infill/infill.cpp" "infill/CMakeFiles/infill.dir/infill.o" "gcc" "infill/CMakeFiles/infill.dir/infill.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/build.make b/examples/sycl/infill/CMakeFiles/infill.dir/build.make
new file mode 100644
index 0000000000000..053f874a44cfb
--- /dev/null
+++ b/examples/sycl/infill/CMakeFiles/infill.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include infill/CMakeFiles/infill.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include infill/CMakeFiles/infill.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include infill/CMakeFiles/infill.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include infill/CMakeFiles/infill.dir/flags.make
+
+infill/CMakeFiles/infill.dir/infill.o: infill/CMakeFiles/infill.dir/flags.make
+infill/CMakeFiles/infill.dir/infill.o: ../infill/infill.cpp
+infill/CMakeFiles/infill.dir/infill.o: infill/CMakeFiles/infill.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object infill/CMakeFiles/infill.dir/infill.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT infill/CMakeFiles/infill.dir/infill.o -MF CMakeFiles/infill.dir/infill.o.d -o CMakeFiles/infill.dir/infill.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/infill/infill.cpp
+
+infill/CMakeFiles/infill.dir/infill.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/infill.dir/infill.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/infill/infill.cpp > CMakeFiles/infill.dir/infill.i
+
+infill/CMakeFiles/infill.dir/infill.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/infill.dir/infill.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/infill/infill.cpp -o CMakeFiles/infill.dir/infill.s
+
+# Object files for target infill
+infill_OBJECTS = \
+"CMakeFiles/infill.dir/infill.o"
+
+# External object files for target infill
+infill_EXTERNAL_OBJECTS =
+
+infill/infill: infill/CMakeFiles/infill.dir/infill.o
+infill/infill: infill/CMakeFiles/infill.dir/build.make
+infill/infill: infill/CMakeFiles/infill.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable infill"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/infill.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+infill/CMakeFiles/infill.dir/build: infill/infill
+.PHONY : infill/CMakeFiles/infill.dir/build
+
+infill/CMakeFiles/infill.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill && $(CMAKE_COMMAND) -P CMakeFiles/infill.dir/cmake_clean.cmake
+.PHONY : infill/CMakeFiles/infill.dir/clean
+
+infill/CMakeFiles/infill.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/infill /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/infill.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : infill/CMakeFiles/infill.dir/depend
+
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/cmake_clean.cmake b/examples/sycl/infill/CMakeFiles/infill.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..87dd3a112b239
--- /dev/null
+++ b/examples/sycl/infill/CMakeFiles/infill.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/infill.dir/infill.o"
+  "CMakeFiles/infill.dir/infill.o.d"
+  "infill"
+  "infill.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/infill.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.make b/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.make
new file mode 100644
index 0000000000000..b1ec2da73449c
--- /dev/null
+++ b/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for infill.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.ts b/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..c8209ec18aa58
--- /dev/null
+++ b/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for infill.
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/depend.make b/examples/sycl/infill/CMakeFiles/infill.dir/depend.make
new file mode 100644
index 0000000000000..7baa2f32944b2
--- /dev/null
+++ b/examples/sycl/infill/CMakeFiles/infill.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for infill.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/flags.make b/examples/sycl/infill/CMakeFiles/infill.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/infill/CMakeFiles/infill.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/link.txt b/examples/sycl/infill/CMakeFiles/infill.dir/link.txt
new file mode 100644
index 0000000000000..591c34ed36c6f
--- /dev/null
+++ b/examples/sycl/infill/CMakeFiles/infill.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/infill.dir/infill.o -o infill  -lcommon -lllama 
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/progress.make b/examples/sycl/infill/CMakeFiles/infill.dir/progress.make
new file mode 100644
index 0000000000000..48b3d8a54961a
--- /dev/null
+++ b/examples/sycl/infill/CMakeFiles/infill.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 19
+CMAKE_PROGRESS_2 = 20
+
diff --git a/examples/sycl/infill/CMakeFiles/progress.marks b/examples/sycl/infill/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/infill/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/infill/Makefile b/examples/sycl/infill/Makefile
new file mode 100644
index 0000000000000..f380cf65e6242
--- /dev/null
+++ b/examples/sycl/infill/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+infill/CMakeFiles/infill.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/CMakeFiles/infill.dir/rule
+.PHONY : infill/CMakeFiles/infill.dir/rule
+
+# Convenience name for target.
+infill: infill/CMakeFiles/infill.dir/rule
+.PHONY : infill
+
+# fast build rule for target.
+infill/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/build
+.PHONY : infill/fast
+
+# target to build an object file
+infill.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/infill.o
+.PHONY : infill.o
+
+# target to preprocess a source file
+infill.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/infill.i
+.PHONY : infill.i
+
+# target to generate assembly for a file
+infill.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/infill.s
+.PHONY : infill.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... infill"
+	@echo "... infill.o"
+	@echo "... infill.i"
+	@echo "... infill.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/infill/cmake_install.cmake b/examples/sycl/infill/cmake_install.cmake
new file mode 100644
index 0000000000000..75eebb36e8919
--- /dev/null
+++ b/examples/sycl/infill/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/infill
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/infill")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/llama-bench/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/llama-bench/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/llama-bench/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..3474b7e651b40
--- /dev/null
+++ b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench/llama-bench.cpp" "llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o" "gcc" "llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/build.make b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/build.make
new file mode 100644
index 0000000000000..f5e7cbbbe44f7
--- /dev/null
+++ b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include llama-bench/CMakeFiles/llama-bench.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include llama-bench/CMakeFiles/llama-bench.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include llama-bench/CMakeFiles/llama-bench.dir/flags.make
+
+llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o: llama-bench/CMakeFiles/llama-bench.dir/flags.make
+llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o: ../llama-bench/llama-bench.cpp
+llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o: llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o -MF CMakeFiles/llama-bench.dir/llama-bench.o.d -o CMakeFiles/llama-bench.dir/llama-bench.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench/llama-bench.cpp
+
+llama-bench/CMakeFiles/llama-bench.dir/llama-bench.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/llama-bench.dir/llama-bench.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench/llama-bench.cpp > CMakeFiles/llama-bench.dir/llama-bench.i
+
+llama-bench/CMakeFiles/llama-bench.dir/llama-bench.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/llama-bench.dir/llama-bench.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench/llama-bench.cpp -o CMakeFiles/llama-bench.dir/llama-bench.s
+
+# Object files for target llama-bench
+llama__bench_OBJECTS = \
+"CMakeFiles/llama-bench.dir/llama-bench.o"
+
+# External object files for target llama-bench
+llama__bench_EXTERNAL_OBJECTS =
+
+llama-bench/llama-bench: llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o
+llama-bench/llama-bench: llama-bench/CMakeFiles/llama-bench.dir/build.make
+llama-bench/llama-bench: llama-bench/CMakeFiles/llama-bench.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable llama-bench"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/llama-bench.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+llama-bench/CMakeFiles/llama-bench.dir/build: llama-bench/llama-bench
+.PHONY : llama-bench/CMakeFiles/llama-bench.dir/build
+
+llama-bench/CMakeFiles/llama-bench.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench && $(CMAKE_COMMAND) -P CMakeFiles/llama-bench.dir/cmake_clean.cmake
+.PHONY : llama-bench/CMakeFiles/llama-bench.dir/clean
+
+llama-bench/CMakeFiles/llama-bench.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : llama-bench/CMakeFiles/llama-bench.dir/depend
+
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/cmake_clean.cmake b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..cb6655fb83e41
--- /dev/null
+++ b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/llama-bench.dir/llama-bench.o"
+  "CMakeFiles/llama-bench.dir/llama-bench.o.d"
+  "llama-bench"
+  "llama-bench.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/llama-bench.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.make b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.make
new file mode 100644
index 0000000000000..1c2e072d2d272
--- /dev/null
+++ b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for llama-bench.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.ts b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..e2fdaf1db7a02
--- /dev/null
+++ b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for llama-bench.
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/depend.make b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/depend.make
new file mode 100644
index 0000000000000..f3b3c9b066e7b
--- /dev/null
+++ b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for llama-bench.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/flags.make b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/link.txt b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/link.txt
new file mode 100644
index 0000000000000..f751098bcfad6
--- /dev/null
+++ b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/llama-bench.dir/llama-bench.o -o llama-bench  -lcommon -lllama 
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/progress.make b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/progress.make
new file mode 100644
index 0000000000000..6ec2abf9db4ad
--- /dev/null
+++ b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 21
+CMAKE_PROGRESS_2 = 22
+
diff --git a/examples/sycl/llama-bench/CMakeFiles/progress.marks b/examples/sycl/llama-bench/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/llama-bench/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/llama-bench/Makefile b/examples/sycl/llama-bench/Makefile
new file mode 100644
index 0000000000000..295a518e0f08f
--- /dev/null
+++ b/examples/sycl/llama-bench/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+llama-bench/CMakeFiles/llama-bench.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/CMakeFiles/llama-bench.dir/rule
+.PHONY : llama-bench/CMakeFiles/llama-bench.dir/rule
+
+# Convenience name for target.
+llama-bench: llama-bench/CMakeFiles/llama-bench.dir/rule
+.PHONY : llama-bench
+
+# fast build rule for target.
+llama-bench/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/build
+.PHONY : llama-bench/fast
+
+# target to build an object file
+llama-bench.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o
+.PHONY : llama-bench.o
+
+# target to preprocess a source file
+llama-bench.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/llama-bench.i
+.PHONY : llama-bench.i
+
+# target to generate assembly for a file
+llama-bench.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/llama-bench.s
+.PHONY : llama-bench.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... llama-bench"
+	@echo "... llama-bench.o"
+	@echo "... llama-bench.i"
+	@echo "... llama-bench.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/llama-bench/cmake_install.cmake b/examples/sycl/llama-bench/cmake_install.cmake
new file mode 100644
index 0000000000000..72de9db7d3b6f
--- /dev/null
+++ b/examples/sycl/llama-bench/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/llama-bench")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/llava/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/llava/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/DependInfo.cmake b/examples/sycl/llava/CMakeFiles/llava-cli.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..891956fae6f83
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava-cli.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava-cli.cpp" "llava/CMakeFiles/llava-cli.dir/llava-cli.o" "gcc" "llava/CMakeFiles/llava-cli.dir/llava-cli.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/build.make b/examples/sycl/llava/CMakeFiles/llava-cli.dir/build.make
new file mode 100644
index 0000000000000..6459cbf3c94c6
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava-cli.dir/build.make
@@ -0,0 +1,114 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include llava/CMakeFiles/llava-cli.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include llava/CMakeFiles/llava-cli.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include llava/CMakeFiles/llava-cli.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include llava/CMakeFiles/llava-cli.dir/flags.make
+
+llava/CMakeFiles/llava-cli.dir/llava-cli.o: llava/CMakeFiles/llava-cli.dir/flags.make
+llava/CMakeFiles/llava-cli.dir/llava-cli.o: ../llava/llava-cli.cpp
+llava/CMakeFiles/llava-cli.dir/llava-cli.o: llava/CMakeFiles/llava-cli.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object llava/CMakeFiles/llava-cli.dir/llava-cli.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT llava/CMakeFiles/llava-cli.dir/llava-cli.o -MF CMakeFiles/llava-cli.dir/llava-cli.o.d -o CMakeFiles/llava-cli.dir/llava-cli.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava-cli.cpp
+
+llava/CMakeFiles/llava-cli.dir/llava-cli.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/llava-cli.dir/llava-cli.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava-cli.cpp > CMakeFiles/llava-cli.dir/llava-cli.i
+
+llava/CMakeFiles/llava-cli.dir/llava-cli.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/llava-cli.dir/llava-cli.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava-cli.cpp -o CMakeFiles/llava-cli.dir/llava-cli.s
+
+# Object files for target llava-cli
+llava__cli_OBJECTS = \
+"CMakeFiles/llava-cli.dir/llava-cli.o"
+
+# External object files for target llava-cli
+llava__cli_EXTERNAL_OBJECTS = \
+"/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir/llava.o" \
+"/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir/clip.o"
+
+llava/llava-cli: llava/CMakeFiles/llava-cli.dir/llava-cli.o
+llava/llava-cli: llava/CMakeFiles/llava.dir/llava.o
+llava/llava-cli: llava/CMakeFiles/llava.dir/clip.o
+llava/llava-cli: llava/CMakeFiles/llava-cli.dir/build.make
+llava/llava-cli: llava/CMakeFiles/llava-cli.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable llava-cli"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/llava-cli.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+llava/CMakeFiles/llava-cli.dir/build: llava/llava-cli
+.PHONY : llava/CMakeFiles/llava-cli.dir/build
+
+llava/CMakeFiles/llava-cli.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -P CMakeFiles/llava-cli.dir/cmake_clean.cmake
+.PHONY : llava/CMakeFiles/llava-cli.dir/clean
+
+llava/CMakeFiles/llava-cli.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava-cli.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : llava/CMakeFiles/llava-cli.dir/depend
+
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/cmake_clean.cmake b/examples/sycl/llava/CMakeFiles/llava-cli.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..9028260a0a143
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava-cli.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/llava-cli.dir/llava-cli.o"
+  "CMakeFiles/llava-cli.dir/llava-cli.o.d"
+  "llava-cli"
+  "llava-cli.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/llava-cli.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.make b/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.make
new file mode 100644
index 0000000000000..0814eb432bb83
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for llava-cli.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.ts b/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..23b629e3f6a90
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for llava-cli.
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/depend.make b/examples/sycl/llava/CMakeFiles/llava-cli.dir/depend.make
new file mode 100644
index 0000000000000..42964b654d8bf
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava-cli.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for llava-cli.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/flags.make b/examples/sycl/llava/CMakeFiles/llava-cli.dir/flags.make
new file mode 100644
index 0000000000000..dd2a15593d7fa
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava-cli.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/. -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/../.. -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/../../common
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/link.txt b/examples/sycl/llava/CMakeFiles/llava-cli.dir/link.txt
new file mode 100644
index 0000000000000..2dc90433d77fe
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava-cli.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/llava-cli.dir/llava-cli.o CMakeFiles/llava.dir/llava.o CMakeFiles/llava.dir/clip.o -o llava-cli  -lcommon -lllama -lggml -lllama 
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/progress.make b/examples/sycl/llava/CMakeFiles/llava-cli.dir/progress.make
new file mode 100644
index 0000000000000..9fd0bf530ff91
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava-cli.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 25
+CMAKE_PROGRESS_2 = 26
+
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/DependInfo.cmake b/examples/sycl/llava/CMakeFiles/llava.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..9612457064b23
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava.dir/DependInfo.cmake
@@ -0,0 +1,20 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/llava/clip.cpp" "llava/CMakeFiles/llava.dir/clip.o" "gcc" "llava/CMakeFiles/llava.dir/clip.o.d"
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava.cpp" "llava/CMakeFiles/llava.dir/llava.o" "gcc" "llava/CMakeFiles/llava.dir/llava.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/build.make b/examples/sycl/llava/CMakeFiles/llava.dir/build.make
new file mode 100644
index 0000000000000..1750c09553cb0
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava.dir/build.make
@@ -0,0 +1,116 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include llava/CMakeFiles/llava.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include llava/CMakeFiles/llava.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include llava/CMakeFiles/llava.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include llava/CMakeFiles/llava.dir/flags.make
+
+llava/CMakeFiles/llava.dir/llava.o: llava/CMakeFiles/llava.dir/flags.make
+llava/CMakeFiles/llava.dir/llava.o: ../llava/llava.cpp
+llava/CMakeFiles/llava.dir/llava.o: llava/CMakeFiles/llava.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object llava/CMakeFiles/llava.dir/llava.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT llava/CMakeFiles/llava.dir/llava.o -MF CMakeFiles/llava.dir/llava.o.d -o CMakeFiles/llava.dir/llava.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava.cpp
+
+llava/CMakeFiles/llava.dir/llava.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/llava.dir/llava.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava.cpp > CMakeFiles/llava.dir/llava.i
+
+llava/CMakeFiles/llava.dir/llava.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/llava.dir/llava.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava.cpp -o CMakeFiles/llava.dir/llava.s
+
+llava/CMakeFiles/llava.dir/clip.o: llava/CMakeFiles/llava.dir/flags.make
+llava/CMakeFiles/llava.dir/clip.o: ../llava/clip.cpp
+llava/CMakeFiles/llava.dir/clip.o: llava/CMakeFiles/llava.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Building CXX object llava/CMakeFiles/llava.dir/clip.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT llava/CMakeFiles/llava.dir/clip.o -MF CMakeFiles/llava.dir/clip.o.d -o CMakeFiles/llava.dir/clip.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/llava/clip.cpp
+
+llava/CMakeFiles/llava.dir/clip.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/llava.dir/clip.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/llava/clip.cpp > CMakeFiles/llava.dir/clip.i
+
+llava/CMakeFiles/llava.dir/clip.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/llava.dir/clip.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/llava/clip.cpp -o CMakeFiles/llava.dir/clip.s
+
+llava: llava/CMakeFiles/llava.dir/llava.o
+llava: llava/CMakeFiles/llava.dir/clip.o
+llava: llava/CMakeFiles/llava.dir/build.make
+.PHONY : llava
+
+# Rule to build all files generated by this target.
+llava/CMakeFiles/llava.dir/build: llava
+.PHONY : llava/CMakeFiles/llava.dir/build
+
+llava/CMakeFiles/llava.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -P CMakeFiles/llava.dir/cmake_clean.cmake
+.PHONY : llava/CMakeFiles/llava.dir/clean
+
+llava/CMakeFiles/llava.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : llava/CMakeFiles/llava.dir/depend
+
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/cmake_clean.cmake b/examples/sycl/llava/CMakeFiles/llava.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..3883a0e373075
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/llava.dir/clip.o"
+  "CMakeFiles/llava.dir/clip.o.d"
+  "CMakeFiles/llava.dir/llava.o"
+  "CMakeFiles/llava.dir/llava.o.d"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/llava.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.make b/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.make
new file mode 100644
index 0000000000000..6dac2f664c5d8
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for llava.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.ts b/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..4e2caa7aebdcf
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for llava.
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/depend.make b/examples/sycl/llava/CMakeFiles/llava.dir/depend.make
new file mode 100644
index 0000000000000..c853fa62d1aa8
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for llava.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/flags.make b/examples/sycl/llava/CMakeFiles/llava.dir/flags.make
new file mode 100644
index 0000000000000..badb2e522d3d5
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/. -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/../.. -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/../../common
+
+CXX_FLAGS = -Wno-cast-qual
+
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/progress.make b/examples/sycl/llava/CMakeFiles/llava.dir/progress.make
new file mode 100644
index 0000000000000..6c29f4fb5e35d
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 23
+CMAKE_PROGRESS_2 = 24
+
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/DependInfo.cmake b/examples/sycl/llava/CMakeFiles/llava_static.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..dc55e44b5556f
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava_static.dir/DependInfo.cmake
@@ -0,0 +1,18 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/build.make b/examples/sycl/llava/CMakeFiles/llava_static.dir/build.make
new file mode 100644
index 0000000000000..204fef459a2c5
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava_static.dir/build.make
@@ -0,0 +1,99 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include llava/CMakeFiles/llava_static.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include llava/CMakeFiles/llava_static.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include llava/CMakeFiles/llava_static.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include llava/CMakeFiles/llava_static.dir/flags.make
+
+# Object files for target llava_static
+llava_static_OBJECTS =
+
+# External object files for target llava_static
+llava_static_EXTERNAL_OBJECTS = \
+"/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir/llava.o" \
+"/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir/clip.o"
+
+llava/libllava_static.a: llava/CMakeFiles/llava.dir/llava.o
+llava/libllava_static.a: llava/CMakeFiles/llava.dir/clip.o
+llava/libllava_static.a: llava/CMakeFiles/llava_static.dir/build.make
+llava/libllava_static.a: llava/CMakeFiles/llava_static.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Linking CXX static library libllava_static.a"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -P CMakeFiles/llava_static.dir/cmake_clean_target.cmake
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/llava_static.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+llava/CMakeFiles/llava_static.dir/build: llava/libllava_static.a
+.PHONY : llava/CMakeFiles/llava_static.dir/build
+
+llava/CMakeFiles/llava_static.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -P CMakeFiles/llava_static.dir/cmake_clean.cmake
+.PHONY : llava/CMakeFiles/llava_static.dir/clean
+
+llava/CMakeFiles/llava_static.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava_static.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : llava/CMakeFiles/llava_static.dir/depend
+
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean.cmake b/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..f45eae4621fb1
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean.cmake
@@ -0,0 +1,9 @@
+file(REMOVE_RECURSE
+  "libllava_static.a"
+  "libllava_static.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/llava_static.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean_target.cmake b/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean_target.cmake
new file mode 100644
index 0000000000000..811ad1cc67be8
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean_target.cmake
@@ -0,0 +1,3 @@
+file(REMOVE_RECURSE
+  "libllava_static.a"
+)
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.make b/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.make
new file mode 100644
index 0000000000000..53d1fc0d078f6
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for llava_static.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.ts b/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..795a3979cf036
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for llava_static.
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/depend.make b/examples/sycl/llava/CMakeFiles/llava_static.dir/depend.make
new file mode 100644
index 0000000000000..dbb3703db5596
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava_static.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for llava_static.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/flags.make b/examples/sycl/llava/CMakeFiles/llava_static.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava_static.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/link.txt b/examples/sycl/llava/CMakeFiles/llava_static.dir/link.txt
new file mode 100644
index 0000000000000..c3f37a40baa9a
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava_static.dir/link.txt
@@ -0,0 +1,2 @@
+/usr/bin/ar qc libllava_static.a CMakeFiles/llava.dir/llava.o CMakeFiles/llava.dir/clip.o
+/usr/bin/ranlib libllava_static.a
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/progress.make b/examples/sycl/llava/CMakeFiles/llava_static.dir/progress.make
new file mode 100644
index 0000000000000..6e61838c631fb
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/llava_static.dir/progress.make
@@ -0,0 +1,2 @@
+CMAKE_PROGRESS_1 = 27
+
diff --git a/examples/sycl/llava/CMakeFiles/progress.marks b/examples/sycl/llava/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..7ed6ff82de6bc
--- /dev/null
+++ b/examples/sycl/llava/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+5
diff --git a/examples/sycl/llava/Makefile b/examples/sycl/llava/Makefile
new file mode 100644
index 0000000000000..fba626a99cbf4
--- /dev/null
+++ b/examples/sycl/llava/Makefile
@@ -0,0 +1,288 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+llava/CMakeFiles/llava.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava.dir/rule
+.PHONY : llava/CMakeFiles/llava.dir/rule
+
+# Convenience name for target.
+llava: llava/CMakeFiles/llava.dir/rule
+.PHONY : llava
+
+# fast build rule for target.
+llava/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/build
+.PHONY : llava/fast
+
+# Convenience name for target.
+llava/CMakeFiles/llava_static.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava_static.dir/rule
+.PHONY : llava/CMakeFiles/llava_static.dir/rule
+
+# Convenience name for target.
+llava_static: llava/CMakeFiles/llava_static.dir/rule
+.PHONY : llava_static
+
+# fast build rule for target.
+llava_static/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava_static.dir/build.make llava/CMakeFiles/llava_static.dir/build
+.PHONY : llava_static/fast
+
+# Convenience name for target.
+llava/CMakeFiles/llava-cli.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava-cli.dir/rule
+.PHONY : llava/CMakeFiles/llava-cli.dir/rule
+
+# Convenience name for target.
+llava-cli: llava/CMakeFiles/llava-cli.dir/rule
+.PHONY : llava-cli
+
+# fast build rule for target.
+llava-cli/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/build
+.PHONY : llava-cli/fast
+
+# target to build an object file
+clip.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/clip.o
+.PHONY : clip.o
+
+# target to preprocess a source file
+clip.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/clip.i
+.PHONY : clip.i
+
+# target to generate assembly for a file
+clip.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/clip.s
+.PHONY : clip.s
+
+# target to build an object file
+llava-cli.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/llava-cli.o
+.PHONY : llava-cli.o
+
+# target to preprocess a source file
+llava-cli.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/llava-cli.i
+.PHONY : llava-cli.i
+
+# target to generate assembly for a file
+llava-cli.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/llava-cli.s
+.PHONY : llava-cli.s
+
+# target to build an object file
+llava.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/llava.o
+.PHONY : llava.o
+
+# target to preprocess a source file
+llava.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/llava.i
+.PHONY : llava.i
+
+# target to generate assembly for a file
+llava.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/llava.s
+.PHONY : llava.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... llava"
+	@echo "... llava-cli"
+	@echo "... llava_static"
+	@echo "... clip.o"
+	@echo "... clip.i"
+	@echo "... clip.s"
+	@echo "... llava-cli.o"
+	@echo "... llava-cli.i"
+	@echo "... llava-cli.s"
+	@echo "... llava.o"
+	@echo "... llava.i"
+	@echo "... llava.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/llava/cmake_install.cmake b/examples/sycl/llava/cmake_install.cmake
new file mode 100644
index 0000000000000..babdc45902de3
--- /dev/null
+++ b/examples/sycl/llava/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/llava
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/llava-cli")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/lookahead/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/lookahead/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/lookahead/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..275ad578cff31
--- /dev/null
+++ b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/lookahead/lookahead.cpp" "lookahead/CMakeFiles/lookahead.dir/lookahead.o" "gcc" "lookahead/CMakeFiles/lookahead.dir/lookahead.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/build.make b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/build.make
new file mode 100644
index 0000000000000..28e8b3553a5b5
--- /dev/null
+++ b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include lookahead/CMakeFiles/lookahead.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include lookahead/CMakeFiles/lookahead.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include lookahead/CMakeFiles/lookahead.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include lookahead/CMakeFiles/lookahead.dir/flags.make
+
+lookahead/CMakeFiles/lookahead.dir/lookahead.o: lookahead/CMakeFiles/lookahead.dir/flags.make
+lookahead/CMakeFiles/lookahead.dir/lookahead.o: ../lookahead/lookahead.cpp
+lookahead/CMakeFiles/lookahead.dir/lookahead.o: lookahead/CMakeFiles/lookahead.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object lookahead/CMakeFiles/lookahead.dir/lookahead.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT lookahead/CMakeFiles/lookahead.dir/lookahead.o -MF CMakeFiles/lookahead.dir/lookahead.o.d -o CMakeFiles/lookahead.dir/lookahead.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/lookahead/lookahead.cpp
+
+lookahead/CMakeFiles/lookahead.dir/lookahead.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/lookahead.dir/lookahead.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/lookahead/lookahead.cpp > CMakeFiles/lookahead.dir/lookahead.i
+
+lookahead/CMakeFiles/lookahead.dir/lookahead.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/lookahead.dir/lookahead.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/lookahead/lookahead.cpp -o CMakeFiles/lookahead.dir/lookahead.s
+
+# Object files for target lookahead
+lookahead_OBJECTS = \
+"CMakeFiles/lookahead.dir/lookahead.o"
+
+# External object files for target lookahead
+lookahead_EXTERNAL_OBJECTS =
+
+lookahead/lookahead: lookahead/CMakeFiles/lookahead.dir/lookahead.o
+lookahead/lookahead: lookahead/CMakeFiles/lookahead.dir/build.make
+lookahead/lookahead: lookahead/CMakeFiles/lookahead.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable lookahead"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/lookahead.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+lookahead/CMakeFiles/lookahead.dir/build: lookahead/lookahead
+.PHONY : lookahead/CMakeFiles/lookahead.dir/build
+
+lookahead/CMakeFiles/lookahead.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead && $(CMAKE_COMMAND) -P CMakeFiles/lookahead.dir/cmake_clean.cmake
+.PHONY : lookahead/CMakeFiles/lookahead.dir/clean
+
+lookahead/CMakeFiles/lookahead.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/lookahead /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : lookahead/CMakeFiles/lookahead.dir/depend
+
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/cmake_clean.cmake b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..1784906a410e3
--- /dev/null
+++ b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/lookahead.dir/lookahead.o"
+  "CMakeFiles/lookahead.dir/lookahead.o.d"
+  "lookahead"
+  "lookahead.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/lookahead.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.make b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.make
new file mode 100644
index 0000000000000..e16e15b4ebd1c
--- /dev/null
+++ b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for lookahead.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.ts b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..abdca07390ae2
--- /dev/null
+++ b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for lookahead.
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/depend.make b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/depend.make
new file mode 100644
index 0000000000000..0aaa54f625ea9
--- /dev/null
+++ b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for lookahead.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/flags.make b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/link.txt b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/link.txt
new file mode 100644
index 0000000000000..490524c8c691f
--- /dev/null
+++ b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/lookahead.dir/lookahead.o -o lookahead  -lcommon -lllama 
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/progress.make b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/progress.make
new file mode 100644
index 0000000000000..ec204d332ef87
--- /dev/null
+++ b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 28
+CMAKE_PROGRESS_2 = 29
+
diff --git a/examples/sycl/lookahead/CMakeFiles/progress.marks b/examples/sycl/lookahead/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/lookahead/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/lookahead/Makefile b/examples/sycl/lookahead/Makefile
new file mode 100644
index 0000000000000..b6028417d5939
--- /dev/null
+++ b/examples/sycl/lookahead/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+lookahead/CMakeFiles/lookahead.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/CMakeFiles/lookahead.dir/rule
+.PHONY : lookahead/CMakeFiles/lookahead.dir/rule
+
+# Convenience name for target.
+lookahead: lookahead/CMakeFiles/lookahead.dir/rule
+.PHONY : lookahead
+
+# fast build rule for target.
+lookahead/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/build
+.PHONY : lookahead/fast
+
+# target to build an object file
+lookahead.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/lookahead.o
+.PHONY : lookahead.o
+
+# target to preprocess a source file
+lookahead.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/lookahead.i
+.PHONY : lookahead.i
+
+# target to generate assembly for a file
+lookahead.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/lookahead.s
+.PHONY : lookahead.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... lookahead"
+	@echo "... lookahead.o"
+	@echo "... lookahead.i"
+	@echo "... lookahead.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/lookahead/cmake_install.cmake b/examples/sycl/lookahead/cmake_install.cmake
new file mode 100644
index 0000000000000..9a2af72130d76
--- /dev/null
+++ b/examples/sycl/lookahead/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/lookahead
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/lookahead")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/lookup/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/lookup/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/lookup/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/DependInfo.cmake b/examples/sycl/lookup/CMakeFiles/lookup.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..235904a16eade
--- /dev/null
+++ b/examples/sycl/lookup/CMakeFiles/lookup.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/lookup/lookup.cpp" "lookup/CMakeFiles/lookup.dir/lookup.o" "gcc" "lookup/CMakeFiles/lookup.dir/lookup.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/build.make b/examples/sycl/lookup/CMakeFiles/lookup.dir/build.make
new file mode 100644
index 0000000000000..4870a818310f0
--- /dev/null
+++ b/examples/sycl/lookup/CMakeFiles/lookup.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include lookup/CMakeFiles/lookup.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include lookup/CMakeFiles/lookup.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include lookup/CMakeFiles/lookup.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include lookup/CMakeFiles/lookup.dir/flags.make
+
+lookup/CMakeFiles/lookup.dir/lookup.o: lookup/CMakeFiles/lookup.dir/flags.make
+lookup/CMakeFiles/lookup.dir/lookup.o: ../lookup/lookup.cpp
+lookup/CMakeFiles/lookup.dir/lookup.o: lookup/CMakeFiles/lookup.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object lookup/CMakeFiles/lookup.dir/lookup.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT lookup/CMakeFiles/lookup.dir/lookup.o -MF CMakeFiles/lookup.dir/lookup.o.d -o CMakeFiles/lookup.dir/lookup.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/lookup/lookup.cpp
+
+lookup/CMakeFiles/lookup.dir/lookup.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/lookup.dir/lookup.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/lookup/lookup.cpp > CMakeFiles/lookup.dir/lookup.i
+
+lookup/CMakeFiles/lookup.dir/lookup.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/lookup.dir/lookup.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/lookup/lookup.cpp -o CMakeFiles/lookup.dir/lookup.s
+
+# Object files for target lookup
+lookup_OBJECTS = \
+"CMakeFiles/lookup.dir/lookup.o"
+
+# External object files for target lookup
+lookup_EXTERNAL_OBJECTS =
+
+lookup/lookup: lookup/CMakeFiles/lookup.dir/lookup.o
+lookup/lookup: lookup/CMakeFiles/lookup.dir/build.make
+lookup/lookup: lookup/CMakeFiles/lookup.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable lookup"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/lookup.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+lookup/CMakeFiles/lookup.dir/build: lookup/lookup
+.PHONY : lookup/CMakeFiles/lookup.dir/build
+
+lookup/CMakeFiles/lookup.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup && $(CMAKE_COMMAND) -P CMakeFiles/lookup.dir/cmake_clean.cmake
+.PHONY : lookup/CMakeFiles/lookup.dir/clean
+
+lookup/CMakeFiles/lookup.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/lookup /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/lookup.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : lookup/CMakeFiles/lookup.dir/depend
+
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/cmake_clean.cmake b/examples/sycl/lookup/CMakeFiles/lookup.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..395eb958ce111
--- /dev/null
+++ b/examples/sycl/lookup/CMakeFiles/lookup.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/lookup.dir/lookup.o"
+  "CMakeFiles/lookup.dir/lookup.o.d"
+  "lookup"
+  "lookup.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/lookup.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.make b/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.make
new file mode 100644
index 0000000000000..ee5894814a53c
--- /dev/null
+++ b/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for lookup.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.ts b/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..7fcb2448b23bf
--- /dev/null
+++ b/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for lookup.
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/depend.make b/examples/sycl/lookup/CMakeFiles/lookup.dir/depend.make
new file mode 100644
index 0000000000000..806a9a3b1639d
--- /dev/null
+++ b/examples/sycl/lookup/CMakeFiles/lookup.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for lookup.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/flags.make b/examples/sycl/lookup/CMakeFiles/lookup.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/lookup/CMakeFiles/lookup.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/link.txt b/examples/sycl/lookup/CMakeFiles/lookup.dir/link.txt
new file mode 100644
index 0000000000000..7f181c941a9c8
--- /dev/null
+++ b/examples/sycl/lookup/CMakeFiles/lookup.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/lookup.dir/lookup.o -o lookup  -lcommon -lllama 
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/progress.make b/examples/sycl/lookup/CMakeFiles/lookup.dir/progress.make
new file mode 100644
index 0000000000000..335ef43aa78f1
--- /dev/null
+++ b/examples/sycl/lookup/CMakeFiles/lookup.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 30
+CMAKE_PROGRESS_2 = 31
+
diff --git a/examples/sycl/lookup/CMakeFiles/progress.marks b/examples/sycl/lookup/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/lookup/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/lookup/Makefile b/examples/sycl/lookup/Makefile
new file mode 100644
index 0000000000000..d125b37f53c62
--- /dev/null
+++ b/examples/sycl/lookup/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+lookup/CMakeFiles/lookup.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/CMakeFiles/lookup.dir/rule
+.PHONY : lookup/CMakeFiles/lookup.dir/rule
+
+# Convenience name for target.
+lookup: lookup/CMakeFiles/lookup.dir/rule
+.PHONY : lookup
+
+# fast build rule for target.
+lookup/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/build
+.PHONY : lookup/fast
+
+# target to build an object file
+lookup.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/lookup.o
+.PHONY : lookup.o
+
+# target to preprocess a source file
+lookup.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/lookup.i
+.PHONY : lookup.i
+
+# target to generate assembly for a file
+lookup.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/lookup.s
+.PHONY : lookup.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... lookup"
+	@echo "... lookup.o"
+	@echo "... lookup.i"
+	@echo "... lookup.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/lookup/cmake_install.cmake b/examples/sycl/lookup/cmake_install.cmake
new file mode 100644
index 0000000000000..0c81d00f52886
--- /dev/null
+++ b/examples/sycl/lookup/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/lookup
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/lookup")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp
new file mode 100644
index 0000000000000..619bee9de33ec
--- /dev/null
+++ b/examples/sycl/ls-sycl-device.cpp
@@ -0,0 +1,60 @@
+/*
+ * #include "common.h"
+
+#include "console.h"
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+*/
+
+#include "ggml-sycl.h"
+
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+#include <signal.h>
+#include <unistd.h>
+#elif defined (_WIN32)
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#include <signal.h>
+#endif
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+
+
+void print_sycl_devices(){
+    int device_count = dpct::dev_mgr::instance().device_count();
+    fprintf(stderr, "found %d SYCL devices:\n", device_count);
+    for (int id = 0; id < device_count; ++id) {
+        dpct::device_info prop;
+        dpct::get_device_info(
+            prop, dpct::dev_mgr::instance().get_device(id));
+        sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
+        fprintf(stderr, "  Device %d: %s,\tcompute capability %d.%d,\n\tmax compute_units %d,\tmax work group size %d,\tmax sub group size %d,\tglobal mem size %lu\n", id,
+                prop.get_name(), prop.get_major_version(),
+                prop.get_minor_version(),
+                prop.get_max_compute_units(),
+                prop.get_max_work_group_size(),
+                prop.get_max_sub_group_size(),
+                prop.get_global_mem_size()
+                );
+    }
+}
+
+int main(int argc, char ** argv) {
+    print_sycl_devices();
+    return 0;
+}
diff --git a/examples/sycl/main/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/main/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/main/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/main/CMakeFiles/main.dir/DependInfo.cmake b/examples/sycl/main/CMakeFiles/main.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..397618f7b1c3b
--- /dev/null
+++ b/examples/sycl/main/CMakeFiles/main.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/main/main.cpp" "main/CMakeFiles/main.dir/main.o" "gcc" "main/CMakeFiles/main.dir/main.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/main/CMakeFiles/main.dir/build.make b/examples/sycl/main/CMakeFiles/main.dir/build.make
new file mode 100644
index 0000000000000..048af40ba5b24
--- /dev/null
+++ b/examples/sycl/main/CMakeFiles/main.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include main/CMakeFiles/main.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include main/CMakeFiles/main.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include main/CMakeFiles/main.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include main/CMakeFiles/main.dir/flags.make
+
+main/CMakeFiles/main.dir/main.o: main/CMakeFiles/main.dir/flags.make
+main/CMakeFiles/main.dir/main.o: ../main/main.cpp
+main/CMakeFiles/main.dir/main.o: main/CMakeFiles/main.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object main/CMakeFiles/main.dir/main.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT main/CMakeFiles/main.dir/main.o -MF CMakeFiles/main.dir/main.o.d -o CMakeFiles/main.dir/main.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/main/main.cpp
+
+main/CMakeFiles/main.dir/main.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/main.dir/main.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/main/main.cpp > CMakeFiles/main.dir/main.i
+
+main/CMakeFiles/main.dir/main.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/main.dir/main.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/main/main.cpp -o CMakeFiles/main.dir/main.s
+
+# Object files for target main
+main_OBJECTS = \
+"CMakeFiles/main.dir/main.o"
+
+# External object files for target main
+main_EXTERNAL_OBJECTS =
+
+main/main: main/CMakeFiles/main.dir/main.o
+main/main: main/CMakeFiles/main.dir/build.make
+main/main: main/CMakeFiles/main.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable main"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/main.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+main/CMakeFiles/main.dir/build: main/main
+.PHONY : main/CMakeFiles/main.dir/build
+
+main/CMakeFiles/main.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main && $(CMAKE_COMMAND) -P CMakeFiles/main.dir/cmake_clean.cmake
+.PHONY : main/CMakeFiles/main.dir/clean
+
+main/CMakeFiles/main.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/main /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/main.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : main/CMakeFiles/main.dir/depend
+
diff --git a/examples/sycl/main/CMakeFiles/main.dir/cmake_clean.cmake b/examples/sycl/main/CMakeFiles/main.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..f408648c7af77
--- /dev/null
+++ b/examples/sycl/main/CMakeFiles/main.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/main.dir/main.o"
+  "CMakeFiles/main.dir/main.o.d"
+  "main"
+  "main.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/main.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.make b/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.make
new file mode 100644
index 0000000000000..386f0a8f7a3c6
--- /dev/null
+++ b/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for main.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.ts b/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..a3467f4e27f70
--- /dev/null
+++ b/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for main.
diff --git a/examples/sycl/main/CMakeFiles/main.dir/depend.make b/examples/sycl/main/CMakeFiles/main.dir/depend.make
new file mode 100644
index 0000000000000..7e75fee722d29
--- /dev/null
+++ b/examples/sycl/main/CMakeFiles/main.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for main.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/main/CMakeFiles/main.dir/flags.make b/examples/sycl/main/CMakeFiles/main.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/main/CMakeFiles/main.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/main/CMakeFiles/main.dir/link.txt b/examples/sycl/main/CMakeFiles/main.dir/link.txt
new file mode 100644
index 0000000000000..0127ad4f185dd
--- /dev/null
+++ b/examples/sycl/main/CMakeFiles/main.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/main.dir/main.o -o main  -lcommon -lllama 
diff --git a/examples/sycl/main/CMakeFiles/main.dir/progress.make b/examples/sycl/main/CMakeFiles/main.dir/progress.make
new file mode 100644
index 0000000000000..30c309113cd04
--- /dev/null
+++ b/examples/sycl/main/CMakeFiles/main.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 34
+CMAKE_PROGRESS_2 = 35
+
diff --git a/examples/sycl/main/CMakeFiles/progress.marks b/examples/sycl/main/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/main/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/main/Makefile b/examples/sycl/main/Makefile
new file mode 100644
index 0000000000000..bcd6c6c9391dc
--- /dev/null
+++ b/examples/sycl/main/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+main/CMakeFiles/main.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/CMakeFiles/main.dir/rule
+.PHONY : main/CMakeFiles/main.dir/rule
+
+# Convenience name for target.
+main: main/CMakeFiles/main.dir/rule
+.PHONY : main
+
+# fast build rule for target.
+main/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/build
+.PHONY : main/fast
+
+# target to build an object file
+main.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/main.o
+.PHONY : main.o
+
+# target to preprocess a source file
+main.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/main.i
+.PHONY : main.i
+
+# target to generate assembly for a file
+main.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/main.s
+.PHONY : main.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... main"
+	@echo "... main.o"
+	@echo "... main.i"
+	@echo "... main.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/main/cmake_install.cmake b/examples/sycl/main/cmake_install.cmake
new file mode 100644
index 0000000000000..dcec227e3f9d0
--- /dev/null
+++ b/examples/sycl/main/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/main
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/main")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/parallel/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/parallel/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/parallel/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/DependInfo.cmake b/examples/sycl/parallel/CMakeFiles/parallel.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..6e1d429f7cd8f
--- /dev/null
+++ b/examples/sycl/parallel/CMakeFiles/parallel.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/parallel/parallel.cpp" "parallel/CMakeFiles/parallel.dir/parallel.o" "gcc" "parallel/CMakeFiles/parallel.dir/parallel.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/build.make b/examples/sycl/parallel/CMakeFiles/parallel.dir/build.make
new file mode 100644
index 0000000000000..c982dff8926cc
--- /dev/null
+++ b/examples/sycl/parallel/CMakeFiles/parallel.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include parallel/CMakeFiles/parallel.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include parallel/CMakeFiles/parallel.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include parallel/CMakeFiles/parallel.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include parallel/CMakeFiles/parallel.dir/flags.make
+
+parallel/CMakeFiles/parallel.dir/parallel.o: parallel/CMakeFiles/parallel.dir/flags.make
+parallel/CMakeFiles/parallel.dir/parallel.o: ../parallel/parallel.cpp
+parallel/CMakeFiles/parallel.dir/parallel.o: parallel/CMakeFiles/parallel.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object parallel/CMakeFiles/parallel.dir/parallel.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT parallel/CMakeFiles/parallel.dir/parallel.o -MF CMakeFiles/parallel.dir/parallel.o.d -o CMakeFiles/parallel.dir/parallel.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/parallel/parallel.cpp
+
+parallel/CMakeFiles/parallel.dir/parallel.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/parallel.dir/parallel.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/parallel/parallel.cpp > CMakeFiles/parallel.dir/parallel.i
+
+parallel/CMakeFiles/parallel.dir/parallel.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/parallel.dir/parallel.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/parallel/parallel.cpp -o CMakeFiles/parallel.dir/parallel.s
+
+# Object files for target parallel
+parallel_OBJECTS = \
+"CMakeFiles/parallel.dir/parallel.o"
+
+# External object files for target parallel
+parallel_EXTERNAL_OBJECTS =
+
+parallel/parallel: parallel/CMakeFiles/parallel.dir/parallel.o
+parallel/parallel: parallel/CMakeFiles/parallel.dir/build.make
+parallel/parallel: parallel/CMakeFiles/parallel.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable parallel"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/parallel.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+parallel/CMakeFiles/parallel.dir/build: parallel/parallel
+.PHONY : parallel/CMakeFiles/parallel.dir/build
+
+parallel/CMakeFiles/parallel.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel && $(CMAKE_COMMAND) -P CMakeFiles/parallel.dir/cmake_clean.cmake
+.PHONY : parallel/CMakeFiles/parallel.dir/clean
+
+parallel/CMakeFiles/parallel.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/parallel /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/parallel.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : parallel/CMakeFiles/parallel.dir/depend
+
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/cmake_clean.cmake b/examples/sycl/parallel/CMakeFiles/parallel.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..88fdf52287490
--- /dev/null
+++ b/examples/sycl/parallel/CMakeFiles/parallel.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/parallel.dir/parallel.o"
+  "CMakeFiles/parallel.dir/parallel.o.d"
+  "parallel"
+  "parallel.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/parallel.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.make b/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.make
new file mode 100644
index 0000000000000..4adb5e4069cbb
--- /dev/null
+++ b/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for parallel.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.ts b/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..4fc0c95335bab
--- /dev/null
+++ b/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for parallel.
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/depend.make b/examples/sycl/parallel/CMakeFiles/parallel.dir/depend.make
new file mode 100644
index 0000000000000..bb47426afbbe8
--- /dev/null
+++ b/examples/sycl/parallel/CMakeFiles/parallel.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for parallel.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/flags.make b/examples/sycl/parallel/CMakeFiles/parallel.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/parallel/CMakeFiles/parallel.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/link.txt b/examples/sycl/parallel/CMakeFiles/parallel.dir/link.txt
new file mode 100644
index 0000000000000..d242b572462cd
--- /dev/null
+++ b/examples/sycl/parallel/CMakeFiles/parallel.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/parallel.dir/parallel.o -o parallel  -lcommon -lllama 
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/progress.make b/examples/sycl/parallel/CMakeFiles/parallel.dir/progress.make
new file mode 100644
index 0000000000000..2178e35f72e84
--- /dev/null
+++ b/examples/sycl/parallel/CMakeFiles/parallel.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 36
+CMAKE_PROGRESS_2 = 37
+
diff --git a/examples/sycl/parallel/CMakeFiles/progress.marks b/examples/sycl/parallel/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/parallel/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/parallel/Makefile b/examples/sycl/parallel/Makefile
new file mode 100644
index 0000000000000..06e1de1287b9b
--- /dev/null
+++ b/examples/sycl/parallel/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+parallel/CMakeFiles/parallel.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/CMakeFiles/parallel.dir/rule
+.PHONY : parallel/CMakeFiles/parallel.dir/rule
+
+# Convenience name for target.
+parallel: parallel/CMakeFiles/parallel.dir/rule
+.PHONY : parallel
+
+# fast build rule for target.
+parallel/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/build
+.PHONY : parallel/fast
+
+# target to build an object file
+parallel.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/parallel.o
+.PHONY : parallel.o
+
+# target to preprocess a source file
+parallel.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/parallel.i
+.PHONY : parallel.i
+
+# target to generate assembly for a file
+parallel.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/parallel.s
+.PHONY : parallel.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... parallel"
+	@echo "... parallel.o"
+	@echo "... parallel.i"
+	@echo "... parallel.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/parallel/cmake_install.cmake b/examples/sycl/parallel/cmake_install.cmake
new file mode 100644
index 0000000000000..495382e76897a
--- /dev/null
+++ b/examples/sycl/parallel/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/parallel
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/parallel")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/perplexity/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/perplexity/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/perplexity/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..66b8b5f1ba73e
--- /dev/null
+++ b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/perplexity/perplexity.cpp" "perplexity/CMakeFiles/perplexity.dir/perplexity.o" "gcc" "perplexity/CMakeFiles/perplexity.dir/perplexity.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/build.make b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/build.make
new file mode 100644
index 0000000000000..85b10e61c975f
--- /dev/null
+++ b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include perplexity/CMakeFiles/perplexity.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include perplexity/CMakeFiles/perplexity.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include perplexity/CMakeFiles/perplexity.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include perplexity/CMakeFiles/perplexity.dir/flags.make
+
+perplexity/CMakeFiles/perplexity.dir/perplexity.o: perplexity/CMakeFiles/perplexity.dir/flags.make
+perplexity/CMakeFiles/perplexity.dir/perplexity.o: ../perplexity/perplexity.cpp
+perplexity/CMakeFiles/perplexity.dir/perplexity.o: perplexity/CMakeFiles/perplexity.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object perplexity/CMakeFiles/perplexity.dir/perplexity.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT perplexity/CMakeFiles/perplexity.dir/perplexity.o -MF CMakeFiles/perplexity.dir/perplexity.o.d -o CMakeFiles/perplexity.dir/perplexity.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/perplexity/perplexity.cpp
+
+perplexity/CMakeFiles/perplexity.dir/perplexity.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/perplexity.dir/perplexity.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/perplexity/perplexity.cpp > CMakeFiles/perplexity.dir/perplexity.i
+
+perplexity/CMakeFiles/perplexity.dir/perplexity.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/perplexity.dir/perplexity.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/perplexity/perplexity.cpp -o CMakeFiles/perplexity.dir/perplexity.s
+
+# Object files for target perplexity
+perplexity_OBJECTS = \
+"CMakeFiles/perplexity.dir/perplexity.o"
+
+# External object files for target perplexity
+perplexity_EXTERNAL_OBJECTS =
+
+perplexity/perplexity: perplexity/CMakeFiles/perplexity.dir/perplexity.o
+perplexity/perplexity: perplexity/CMakeFiles/perplexity.dir/build.make
+perplexity/perplexity: perplexity/CMakeFiles/perplexity.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable perplexity"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/perplexity.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+perplexity/CMakeFiles/perplexity.dir/build: perplexity/perplexity
+.PHONY : perplexity/CMakeFiles/perplexity.dir/build
+
+perplexity/CMakeFiles/perplexity.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity && $(CMAKE_COMMAND) -P CMakeFiles/perplexity.dir/cmake_clean.cmake
+.PHONY : perplexity/CMakeFiles/perplexity.dir/clean
+
+perplexity/CMakeFiles/perplexity.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/perplexity /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : perplexity/CMakeFiles/perplexity.dir/depend
+
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/cmake_clean.cmake b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..b4c2d0a0102e5
--- /dev/null
+++ b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/perplexity.dir/perplexity.o"
+  "CMakeFiles/perplexity.dir/perplexity.o.d"
+  "perplexity"
+  "perplexity.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/perplexity.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.make b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.make
new file mode 100644
index 0000000000000..5bde1b10339f0
--- /dev/null
+++ b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for perplexity.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.ts b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..5fe121b3b05c1
--- /dev/null
+++ b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for perplexity.
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/depend.make b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/depend.make
new file mode 100644
index 0000000000000..405206272bc89
--- /dev/null
+++ b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for perplexity.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/flags.make b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/link.txt b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/link.txt
new file mode 100644
index 0000000000000..c08ea7dca9943
--- /dev/null
+++ b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/perplexity.dir/perplexity.o -o perplexity  -lcommon -lllama 
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/progress.make b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/progress.make
new file mode 100644
index 0000000000000..ad2d357457759
--- /dev/null
+++ b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 38
+CMAKE_PROGRESS_2 = 39
+
diff --git a/examples/sycl/perplexity/CMakeFiles/progress.marks b/examples/sycl/perplexity/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/perplexity/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/perplexity/Makefile b/examples/sycl/perplexity/Makefile
new file mode 100644
index 0000000000000..0eefcb75e12a6
--- /dev/null
+++ b/examples/sycl/perplexity/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+perplexity/CMakeFiles/perplexity.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/CMakeFiles/perplexity.dir/rule
+.PHONY : perplexity/CMakeFiles/perplexity.dir/rule
+
+# Convenience name for target.
+perplexity: perplexity/CMakeFiles/perplexity.dir/rule
+.PHONY : perplexity
+
+# fast build rule for target.
+perplexity/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/build
+.PHONY : perplexity/fast
+
+# target to build an object file
+perplexity.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/perplexity.o
+.PHONY : perplexity.o
+
+# target to preprocess a source file
+perplexity.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/perplexity.i
+.PHONY : perplexity.i
+
+# target to generate assembly for a file
+perplexity.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/perplexity.s
+.PHONY : perplexity.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... perplexity"
+	@echo "... perplexity.o"
+	@echo "... perplexity.i"
+	@echo "... perplexity.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/perplexity/cmake_install.cmake b/examples/sycl/perplexity/cmake_install.cmake
new file mode 100644
index 0000000000000..7ab46c66fc764
--- /dev/null
+++ b/examples/sycl/perplexity/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/perplexity
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/perplexity")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/quantize-stats/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/quantize-stats/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/quantize-stats/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/quantize-stats/CMakeFiles/progress.marks b/examples/sycl/quantize-stats/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/quantize-stats/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..2adf5eb27255b
--- /dev/null
+++ b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats/quantize-stats.cpp" "quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o" "gcc" "quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/build.make b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/build.make
new file mode 100644
index 0000000000000..b0f7750b26259
--- /dev/null
+++ b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include quantize-stats/CMakeFiles/quantize-stats.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include quantize-stats/CMakeFiles/quantize-stats.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include quantize-stats/CMakeFiles/quantize-stats.dir/flags.make
+
+quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o: quantize-stats/CMakeFiles/quantize-stats.dir/flags.make
+quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o: ../quantize-stats/quantize-stats.cpp
+quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o: quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o -MF CMakeFiles/quantize-stats.dir/quantize-stats.o.d -o CMakeFiles/quantize-stats.dir/quantize-stats.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats/quantize-stats.cpp
+
+quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/quantize-stats.dir/quantize-stats.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats/quantize-stats.cpp > CMakeFiles/quantize-stats.dir/quantize-stats.i
+
+quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/quantize-stats.dir/quantize-stats.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats/quantize-stats.cpp -o CMakeFiles/quantize-stats.dir/quantize-stats.s
+
+# Object files for target quantize-stats
+quantize__stats_OBJECTS = \
+"CMakeFiles/quantize-stats.dir/quantize-stats.o"
+
+# External object files for target quantize-stats
+quantize__stats_EXTERNAL_OBJECTS =
+
+quantize-stats/quantize-stats: quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o
+quantize-stats/quantize-stats: quantize-stats/CMakeFiles/quantize-stats.dir/build.make
+quantize-stats/quantize-stats: quantize-stats/CMakeFiles/quantize-stats.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable quantize-stats"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/quantize-stats.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+quantize-stats/CMakeFiles/quantize-stats.dir/build: quantize-stats/quantize-stats
+.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/build
+
+quantize-stats/CMakeFiles/quantize-stats.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats && $(CMAKE_COMMAND) -P CMakeFiles/quantize-stats.dir/cmake_clean.cmake
+.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/clean
+
+quantize-stats/CMakeFiles/quantize-stats.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/depend
+
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/cmake_clean.cmake b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..3561c9f782621
--- /dev/null
+++ b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/quantize-stats.dir/quantize-stats.o"
+  "CMakeFiles/quantize-stats.dir/quantize-stats.o.d"
+  "quantize-stats"
+  "quantize-stats.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/quantize-stats.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.make b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.make
new file mode 100644
index 0000000000000..9dc6febaed86d
--- /dev/null
+++ b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for quantize-stats.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.ts b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..651269eca6c8d
--- /dev/null
+++ b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for quantize-stats.
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/depend.make b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/depend.make
new file mode 100644
index 0000000000000..107cbdfe92549
--- /dev/null
+++ b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for quantize-stats.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/flags.make b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/flags.make
new file mode 100644
index 0000000000000..94ae3d85e6a12
--- /dev/null
+++ b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples -I/home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats/../../common
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/link.txt b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/link.txt
new file mode 100644
index 0000000000000..fc2253c853144
--- /dev/null
+++ b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/quantize-stats.dir/quantize-stats.o -o quantize-stats  -lllama -lbuild_info 
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/progress.make b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/progress.make
new file mode 100644
index 0000000000000..0ae216536520f
--- /dev/null
+++ b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 42
+CMAKE_PROGRESS_2 = 43
+
diff --git a/examples/sycl/quantize-stats/Makefile b/examples/sycl/quantize-stats/Makefile
new file mode 100644
index 0000000000000..96034d833233a
--- /dev/null
+++ b/examples/sycl/quantize-stats/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+quantize-stats/CMakeFiles/quantize-stats.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/CMakeFiles/quantize-stats.dir/rule
+.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/rule
+
+# Convenience name for target.
+quantize-stats: quantize-stats/CMakeFiles/quantize-stats.dir/rule
+.PHONY : quantize-stats
+
+# fast build rule for target.
+quantize-stats/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/build
+.PHONY : quantize-stats/fast
+
+# target to build an object file
+quantize-stats.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o
+.PHONY : quantize-stats.o
+
+# target to preprocess a source file
+quantize-stats.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.i
+.PHONY : quantize-stats.i
+
+# target to generate assembly for a file
+quantize-stats.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.s
+.PHONY : quantize-stats.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... quantize-stats"
+	@echo "... quantize-stats.o"
+	@echo "... quantize-stats.i"
+	@echo "... quantize-stats.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/quantize-stats/cmake_install.cmake b/examples/sycl/quantize-stats/cmake_install.cmake
new file mode 100644
index 0000000000000..0be2f85c929db
--- /dev/null
+++ b/examples/sycl/quantize-stats/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/quantize-stats")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/quantize/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/quantize/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/quantize/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/quantize/CMakeFiles/progress.marks b/examples/sycl/quantize/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/quantize/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/DependInfo.cmake b/examples/sycl/quantize/CMakeFiles/quantize.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..f61118e2cf827
--- /dev/null
+++ b/examples/sycl/quantize/CMakeFiles/quantize.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/quantize/quantize.cpp" "quantize/CMakeFiles/quantize.dir/quantize.o" "gcc" "quantize/CMakeFiles/quantize.dir/quantize.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/build.make b/examples/sycl/quantize/CMakeFiles/quantize.dir/build.make
new file mode 100644
index 0000000000000..bdc0585c11831
--- /dev/null
+++ b/examples/sycl/quantize/CMakeFiles/quantize.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include quantize/CMakeFiles/quantize.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include quantize/CMakeFiles/quantize.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include quantize/CMakeFiles/quantize.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include quantize/CMakeFiles/quantize.dir/flags.make
+
+quantize/CMakeFiles/quantize.dir/quantize.o: quantize/CMakeFiles/quantize.dir/flags.make
+quantize/CMakeFiles/quantize.dir/quantize.o: ../quantize/quantize.cpp
+quantize/CMakeFiles/quantize.dir/quantize.o: quantize/CMakeFiles/quantize.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object quantize/CMakeFiles/quantize.dir/quantize.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT quantize/CMakeFiles/quantize.dir/quantize.o -MF CMakeFiles/quantize.dir/quantize.o.d -o CMakeFiles/quantize.dir/quantize.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/quantize/quantize.cpp
+
+quantize/CMakeFiles/quantize.dir/quantize.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/quantize.dir/quantize.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/quantize/quantize.cpp > CMakeFiles/quantize.dir/quantize.i
+
+quantize/CMakeFiles/quantize.dir/quantize.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/quantize.dir/quantize.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/quantize/quantize.cpp -o CMakeFiles/quantize.dir/quantize.s
+
+# Object files for target quantize
+quantize_OBJECTS = \
+"CMakeFiles/quantize.dir/quantize.o"
+
+# External object files for target quantize
+quantize_EXTERNAL_OBJECTS =
+
+quantize/quantize: quantize/CMakeFiles/quantize.dir/quantize.o
+quantize/quantize: quantize/CMakeFiles/quantize.dir/build.make
+quantize/quantize: quantize/CMakeFiles/quantize.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable quantize"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/quantize.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+quantize/CMakeFiles/quantize.dir/build: quantize/quantize
+.PHONY : quantize/CMakeFiles/quantize.dir/build
+
+quantize/CMakeFiles/quantize.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize && $(CMAKE_COMMAND) -P CMakeFiles/quantize.dir/cmake_clean.cmake
+.PHONY : quantize/CMakeFiles/quantize.dir/clean
+
+quantize/CMakeFiles/quantize.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/quantize /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/quantize.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : quantize/CMakeFiles/quantize.dir/depend
+
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/cmake_clean.cmake b/examples/sycl/quantize/CMakeFiles/quantize.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..c1f5c99ba49e6
--- /dev/null
+++ b/examples/sycl/quantize/CMakeFiles/quantize.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/quantize.dir/quantize.o"
+  "CMakeFiles/quantize.dir/quantize.o.d"
+  "quantize"
+  "quantize.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/quantize.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.make b/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.make
new file mode 100644
index 0000000000000..7c3c9a8f56920
--- /dev/null
+++ b/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for quantize.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.ts b/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..47e3bfba7fda6
--- /dev/null
+++ b/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for quantize.
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/depend.make b/examples/sycl/quantize/CMakeFiles/quantize.dir/depend.make
new file mode 100644
index 0000000000000..7ce880b4e6ff6
--- /dev/null
+++ b/examples/sycl/quantize/CMakeFiles/quantize.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for quantize.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/flags.make b/examples/sycl/quantize/CMakeFiles/quantize.dir/flags.make
new file mode 100644
index 0000000000000..e2c15ffddb1bd
--- /dev/null
+++ b/examples/sycl/quantize/CMakeFiles/quantize.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples -I/home/jianyuzh/ws/llama.cpp/develop/examples/quantize/../../common
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/link.txt b/examples/sycl/quantize/CMakeFiles/quantize.dir/link.txt
new file mode 100644
index 0000000000000..bbe5e4877c944
--- /dev/null
+++ b/examples/sycl/quantize/CMakeFiles/quantize.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/quantize.dir/quantize.o -o quantize  -lllama -lbuild_info 
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/progress.make b/examples/sycl/quantize/CMakeFiles/quantize.dir/progress.make
new file mode 100644
index 0000000000000..eb247b0834970
--- /dev/null
+++ b/examples/sycl/quantize/CMakeFiles/quantize.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 40
+CMAKE_PROGRESS_2 = 41
+
diff --git a/examples/sycl/quantize/Makefile b/examples/sycl/quantize/Makefile
new file mode 100644
index 0000000000000..d2ba6116c01c4
--- /dev/null
+++ b/examples/sycl/quantize/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+quantize/CMakeFiles/quantize.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/CMakeFiles/quantize.dir/rule
+.PHONY : quantize/CMakeFiles/quantize.dir/rule
+
+# Convenience name for target.
+quantize: quantize/CMakeFiles/quantize.dir/rule
+.PHONY : quantize
+
+# fast build rule for target.
+quantize/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/build
+.PHONY : quantize/fast
+
+# target to build an object file
+quantize.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/quantize.o
+.PHONY : quantize.o
+
+# target to preprocess a source file
+quantize.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/quantize.i
+.PHONY : quantize.i
+
+# target to generate assembly for a file
+quantize.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/quantize.s
+.PHONY : quantize.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... quantize"
+	@echo "... quantize.o"
+	@echo "... quantize.i"
+	@echo "... quantize.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/quantize/cmake_install.cmake b/examples/sycl/quantize/cmake_install.cmake
new file mode 100644
index 0000000000000..b809c4e4a77cf
--- /dev/null
+++ b/examples/sycl/quantize/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/quantize
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/quantize")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/save-load-state/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/save-load-state/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/save-load-state/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/save-load-state/CMakeFiles/progress.marks b/examples/sycl/save-load-state/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/save-load-state/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..9fc824e9a7976
--- /dev/null
+++ b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state/save-load-state.cpp" "save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o" "gcc" "save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/build.make b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/build.make
new file mode 100644
index 0000000000000..ac2b2a2609fee
--- /dev/null
+++ b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include save-load-state/CMakeFiles/save-load-state.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include save-load-state/CMakeFiles/save-load-state.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include save-load-state/CMakeFiles/save-load-state.dir/flags.make
+
+save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o: save-load-state/CMakeFiles/save-load-state.dir/flags.make
+save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o: ../save-load-state/save-load-state.cpp
+save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o: save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o -MF CMakeFiles/save-load-state.dir/save-load-state.o.d -o CMakeFiles/save-load-state.dir/save-load-state.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state/save-load-state.cpp
+
+save-load-state/CMakeFiles/save-load-state.dir/save-load-state.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/save-load-state.dir/save-load-state.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state/save-load-state.cpp > CMakeFiles/save-load-state.dir/save-load-state.i
+
+save-load-state/CMakeFiles/save-load-state.dir/save-load-state.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/save-load-state.dir/save-load-state.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state/save-load-state.cpp -o CMakeFiles/save-load-state.dir/save-load-state.s
+
+# Object files for target save-load-state
+save__load__state_OBJECTS = \
+"CMakeFiles/save-load-state.dir/save-load-state.o"
+
+# External object files for target save-load-state
+save__load__state_EXTERNAL_OBJECTS =
+
+save-load-state/save-load-state: save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o
+save-load-state/save-load-state: save-load-state/CMakeFiles/save-load-state.dir/build.make
+save-load-state/save-load-state: save-load-state/CMakeFiles/save-load-state.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable save-load-state"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/save-load-state.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+save-load-state/CMakeFiles/save-load-state.dir/build: save-load-state/save-load-state
+.PHONY : save-load-state/CMakeFiles/save-load-state.dir/build
+
+save-load-state/CMakeFiles/save-load-state.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state && $(CMAKE_COMMAND) -P CMakeFiles/save-load-state.dir/cmake_clean.cmake
+.PHONY : save-load-state/CMakeFiles/save-load-state.dir/clean
+
+save-load-state/CMakeFiles/save-load-state.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : save-load-state/CMakeFiles/save-load-state.dir/depend
+
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/cmake_clean.cmake b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..af5b89fcb9eca
--- /dev/null
+++ b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/save-load-state.dir/save-load-state.o"
+  "CMakeFiles/save-load-state.dir/save-load-state.o.d"
+  "save-load-state"
+  "save-load-state.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/save-load-state.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.make b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.make
new file mode 100644
index 0000000000000..a4577943f7e2d
--- /dev/null
+++ b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for save-load-state.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.ts b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..64d7612a4a24d
--- /dev/null
+++ b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for save-load-state.
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/depend.make b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/depend.make
new file mode 100644
index 0000000000000..f99caaf712a32
--- /dev/null
+++ b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for save-load-state.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/flags.make b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/link.txt b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/link.txt
new file mode 100644
index 0000000000000..b1324b558d89e
--- /dev/null
+++ b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/save-load-state.dir/save-load-state.o -o save-load-state  -lcommon -lllama 
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/progress.make b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/progress.make
new file mode 100644
index 0000000000000..c51eef3db77b0
--- /dev/null
+++ b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 44
+CMAKE_PROGRESS_2 = 45
+
diff --git a/examples/sycl/save-load-state/Makefile b/examples/sycl/save-load-state/Makefile
new file mode 100644
index 0000000000000..c9509ab5c7c91
--- /dev/null
+++ b/examples/sycl/save-load-state/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+save-load-state/CMakeFiles/save-load-state.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/CMakeFiles/save-load-state.dir/rule
+.PHONY : save-load-state/CMakeFiles/save-load-state.dir/rule
+
+# Convenience name for target.
+save-load-state: save-load-state/CMakeFiles/save-load-state.dir/rule
+.PHONY : save-load-state
+
+# fast build rule for target.
+save-load-state/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/build
+.PHONY : save-load-state/fast
+
+# target to build an object file
+save-load-state.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o
+.PHONY : save-load-state.o
+
+# target to preprocess a source file
+save-load-state.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/save-load-state.i
+.PHONY : save-load-state.i
+
+# target to generate assembly for a file
+save-load-state.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/save-load-state.s
+.PHONY : save-load-state.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... save-load-state"
+	@echo "... save-load-state.o"
+	@echo "... save-load-state.i"
+	@echo "... save-load-state.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/save-load-state/cmake_install.cmake b/examples/sycl/save-load-state/cmake_install.cmake
new file mode 100644
index 0000000000000..6fa691a41e16e
--- /dev/null
+++ b/examples/sycl/save-load-state/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/save-load-state")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/simple/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/simple/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/simple/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/simple/CMakeFiles/progress.marks b/examples/sycl/simple/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/simple/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/DependInfo.cmake b/examples/sycl/simple/CMakeFiles/simple.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..9d85da155f74d
--- /dev/null
+++ b/examples/sycl/simple/CMakeFiles/simple.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/simple/simple.cpp" "simple/CMakeFiles/simple.dir/simple.o" "gcc" "simple/CMakeFiles/simple.dir/simple.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/build.make b/examples/sycl/simple/CMakeFiles/simple.dir/build.make
new file mode 100644
index 0000000000000..a6aa4f6338188
--- /dev/null
+++ b/examples/sycl/simple/CMakeFiles/simple.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include simple/CMakeFiles/simple.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include simple/CMakeFiles/simple.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include simple/CMakeFiles/simple.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include simple/CMakeFiles/simple.dir/flags.make
+
+simple/CMakeFiles/simple.dir/simple.o: simple/CMakeFiles/simple.dir/flags.make
+simple/CMakeFiles/simple.dir/simple.o: ../simple/simple.cpp
+simple/CMakeFiles/simple.dir/simple.o: simple/CMakeFiles/simple.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object simple/CMakeFiles/simple.dir/simple.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT simple/CMakeFiles/simple.dir/simple.o -MF CMakeFiles/simple.dir/simple.o.d -o CMakeFiles/simple.dir/simple.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/simple/simple.cpp
+
+simple/CMakeFiles/simple.dir/simple.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/simple.dir/simple.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/simple/simple.cpp > CMakeFiles/simple.dir/simple.i
+
+simple/CMakeFiles/simple.dir/simple.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/simple.dir/simple.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/simple/simple.cpp -o CMakeFiles/simple.dir/simple.s
+
+# Object files for target simple
+simple_OBJECTS = \
+"CMakeFiles/simple.dir/simple.o"
+
+# External object files for target simple
+simple_EXTERNAL_OBJECTS =
+
+simple/simple: simple/CMakeFiles/simple.dir/simple.o
+simple/simple: simple/CMakeFiles/simple.dir/build.make
+simple/simple: simple/CMakeFiles/simple.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable simple"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/simple.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+simple/CMakeFiles/simple.dir/build: simple/simple
+.PHONY : simple/CMakeFiles/simple.dir/build
+
+simple/CMakeFiles/simple.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple && $(CMAKE_COMMAND) -P CMakeFiles/simple.dir/cmake_clean.cmake
+.PHONY : simple/CMakeFiles/simple.dir/clean
+
+simple/CMakeFiles/simple.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/simple /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/simple.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : simple/CMakeFiles/simple.dir/depend
+
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/cmake_clean.cmake b/examples/sycl/simple/CMakeFiles/simple.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..554207f3bf60e
--- /dev/null
+++ b/examples/sycl/simple/CMakeFiles/simple.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/simple.dir/simple.o"
+  "CMakeFiles/simple.dir/simple.o.d"
+  "simple"
+  "simple.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/simple.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.make b/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.make
new file mode 100644
index 0000000000000..5703239cd22ca
--- /dev/null
+++ b/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for simple.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.ts b/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..3712ddf6ecefc
--- /dev/null
+++ b/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for simple.
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/depend.make b/examples/sycl/simple/CMakeFiles/simple.dir/depend.make
new file mode 100644
index 0000000000000..1857af881f09a
--- /dev/null
+++ b/examples/sycl/simple/CMakeFiles/simple.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for simple.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/flags.make b/examples/sycl/simple/CMakeFiles/simple.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/simple/CMakeFiles/simple.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/link.txt b/examples/sycl/simple/CMakeFiles/simple.dir/link.txt
new file mode 100644
index 0000000000000..c5460183e6dca
--- /dev/null
+++ b/examples/sycl/simple/CMakeFiles/simple.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/simple.dir/simple.o -o simple  -lcommon -lllama 
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/progress.make b/examples/sycl/simple/CMakeFiles/simple.dir/progress.make
new file mode 100644
index 0000000000000..80ac6c176b991
--- /dev/null
+++ b/examples/sycl/simple/CMakeFiles/simple.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 46
+CMAKE_PROGRESS_2 = 47
+
diff --git a/examples/sycl/simple/Makefile b/examples/sycl/simple/Makefile
new file mode 100644
index 0000000000000..d22ab5997f36e
--- /dev/null
+++ b/examples/sycl/simple/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+simple/CMakeFiles/simple.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/CMakeFiles/simple.dir/rule
+.PHONY : simple/CMakeFiles/simple.dir/rule
+
+# Convenience name for target.
+simple: simple/CMakeFiles/simple.dir/rule
+.PHONY : simple
+
+# fast build rule for target.
+simple/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/build
+.PHONY : simple/fast
+
+# target to build an object file
+simple.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/simple.o
+.PHONY : simple.o
+
+# target to preprocess a source file
+simple.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/simple.i
+.PHONY : simple.i
+
+# target to generate assembly for a file
+simple.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/simple.s
+.PHONY : simple.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... simple"
+	@echo "... simple.o"
+	@echo "... simple.i"
+	@echo "... simple.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/simple/cmake_install.cmake b/examples/sycl/simple/cmake_install.cmake
new file mode 100644
index 0000000000000..22ab3eaa27cb4
--- /dev/null
+++ b/examples/sycl/simple/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/simple
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/simple")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/speculative/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/speculative/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/speculative/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/speculative/CMakeFiles/progress.marks b/examples/sycl/speculative/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/speculative/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/DependInfo.cmake b/examples/sycl/speculative/CMakeFiles/speculative.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..9e7f0cddf574b
--- /dev/null
+++ b/examples/sycl/speculative/CMakeFiles/speculative.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/speculative/speculative.cpp" "speculative/CMakeFiles/speculative.dir/speculative.o" "gcc" "speculative/CMakeFiles/speculative.dir/speculative.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/build.make b/examples/sycl/speculative/CMakeFiles/speculative.dir/build.make
new file mode 100644
index 0000000000000..64c40f740346c
--- /dev/null
+++ b/examples/sycl/speculative/CMakeFiles/speculative.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include speculative/CMakeFiles/speculative.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include speculative/CMakeFiles/speculative.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include speculative/CMakeFiles/speculative.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include speculative/CMakeFiles/speculative.dir/flags.make
+
+speculative/CMakeFiles/speculative.dir/speculative.o: speculative/CMakeFiles/speculative.dir/flags.make
+speculative/CMakeFiles/speculative.dir/speculative.o: ../speculative/speculative.cpp
+speculative/CMakeFiles/speculative.dir/speculative.o: speculative/CMakeFiles/speculative.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object speculative/CMakeFiles/speculative.dir/speculative.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT speculative/CMakeFiles/speculative.dir/speculative.o -MF CMakeFiles/speculative.dir/speculative.o.d -o CMakeFiles/speculative.dir/speculative.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/speculative/speculative.cpp
+
+speculative/CMakeFiles/speculative.dir/speculative.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/speculative.dir/speculative.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/speculative/speculative.cpp > CMakeFiles/speculative.dir/speculative.i
+
+speculative/CMakeFiles/speculative.dir/speculative.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/speculative.dir/speculative.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/speculative/speculative.cpp -o CMakeFiles/speculative.dir/speculative.s
+
+# Object files for target speculative
+speculative_OBJECTS = \
+"CMakeFiles/speculative.dir/speculative.o"
+
+# External object files for target speculative
+speculative_EXTERNAL_OBJECTS =
+
+speculative/speculative: speculative/CMakeFiles/speculative.dir/speculative.o
+speculative/speculative: speculative/CMakeFiles/speculative.dir/build.make
+speculative/speculative: speculative/CMakeFiles/speculative.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable speculative"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/speculative.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+speculative/CMakeFiles/speculative.dir/build: speculative/speculative
+.PHONY : speculative/CMakeFiles/speculative.dir/build
+
+speculative/CMakeFiles/speculative.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative && $(CMAKE_COMMAND) -P CMakeFiles/speculative.dir/cmake_clean.cmake
+.PHONY : speculative/CMakeFiles/speculative.dir/clean
+
+speculative/CMakeFiles/speculative.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/speculative /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/speculative.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : speculative/CMakeFiles/speculative.dir/depend
+
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/cmake_clean.cmake b/examples/sycl/speculative/CMakeFiles/speculative.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..68845ff09127a
--- /dev/null
+++ b/examples/sycl/speculative/CMakeFiles/speculative.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/speculative.dir/speculative.o"
+  "CMakeFiles/speculative.dir/speculative.o.d"
+  "speculative"
+  "speculative.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/speculative.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.make b/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.make
new file mode 100644
index 0000000000000..b7621539416da
--- /dev/null
+++ b/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for speculative.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.ts b/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..2afc705eca269
--- /dev/null
+++ b/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for speculative.
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/depend.make b/examples/sycl/speculative/CMakeFiles/speculative.dir/depend.make
new file mode 100644
index 0000000000000..6da57c2baa79f
--- /dev/null
+++ b/examples/sycl/speculative/CMakeFiles/speculative.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for speculative.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/flags.make b/examples/sycl/speculative/CMakeFiles/speculative.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/speculative/CMakeFiles/speculative.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/link.txt b/examples/sycl/speculative/CMakeFiles/speculative.dir/link.txt
new file mode 100644
index 0000000000000..9cfb674c94f2a
--- /dev/null
+++ b/examples/sycl/speculative/CMakeFiles/speculative.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/speculative.dir/speculative.o -o speculative  -lcommon -lllama 
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/progress.make b/examples/sycl/speculative/CMakeFiles/speculative.dir/progress.make
new file mode 100644
index 0000000000000..961e1cad8a491
--- /dev/null
+++ b/examples/sycl/speculative/CMakeFiles/speculative.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 48
+CMAKE_PROGRESS_2 = 49
+
diff --git a/examples/sycl/speculative/Makefile b/examples/sycl/speculative/Makefile
new file mode 100644
index 0000000000000..713de48f104ff
--- /dev/null
+++ b/examples/sycl/speculative/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+speculative/CMakeFiles/speculative.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/CMakeFiles/speculative.dir/rule
+.PHONY : speculative/CMakeFiles/speculative.dir/rule
+
+# Convenience name for target.
+speculative: speculative/CMakeFiles/speculative.dir/rule
+.PHONY : speculative
+
+# fast build rule for target.
+speculative/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/build
+.PHONY : speculative/fast
+
+# target to build an object file
+speculative.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/speculative.o
+.PHONY : speculative.o
+
+# target to preprocess a source file
+speculative.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/speculative.i
+.PHONY : speculative.i
+
+# target to generate assembly for a file
+speculative.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/speculative.s
+.PHONY : speculative.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... speculative"
+	@echo "... speculative.o"
+	@echo "... speculative.i"
+	@echo "... speculative.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/speculative/cmake_install.cmake b/examples/sycl/speculative/cmake_install.cmake
new file mode 100644
index 0000000000000..7c9f489081d36
--- /dev/null
+++ b/examples/sycl/speculative/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/speculative
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/speculative")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/sycl/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..f86d13476a2fa
--- /dev/null
+++ b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/ls-sycl-device.cpp" "sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o" "gcc" "sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/build.make b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/build.make
new file mode 100644
index 0000000000000..9a8988ea8affd
--- /dev/null
+++ b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include sycl/CMakeFiles/ls-sycl-device.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include sycl/CMakeFiles/ls-sycl-device.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include sycl/CMakeFiles/ls-sycl-device.dir/flags.make
+
+sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o: sycl/CMakeFiles/ls-sycl-device.dir/flags.make
+sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o: ls-sycl-device.cpp
+sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o: sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o -MF CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o.d -o CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/ls-sycl-device.cpp
+
+sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/ls-sycl-device.dir/ls-sycl-device.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/ls-sycl-device.cpp > CMakeFiles/ls-sycl-device.dir/ls-sycl-device.i
+
+sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/ls-sycl-device.dir/ls-sycl-device.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/ls-sycl-device.cpp -o CMakeFiles/ls-sycl-device.dir/ls-sycl-device.s
+
+# Object files for target ls-sycl-device
+ls__sycl__device_OBJECTS = \
+"CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o"
+
+# External object files for target ls-sycl-device
+ls__sycl__device_EXTERNAL_OBJECTS =
+
+sycl/ls-sycl-device: sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o
+sycl/ls-sycl-device: sycl/CMakeFiles/ls-sycl-device.dir/build.make
+sycl/ls-sycl-device: sycl/CMakeFiles/ls-sycl-device.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable ls-sycl-device"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/ls-sycl-device.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+sycl/CMakeFiles/ls-sycl-device.dir/build: sycl/ls-sycl-device
+.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/build
+
+sycl/CMakeFiles/ls-sycl-device.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl && $(CMAKE_COMMAND) -P CMakeFiles/ls-sycl-device.dir/cmake_clean.cmake
+.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/clean
+
+sycl/CMakeFiles/ls-sycl-device.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/depend
+
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/cmake_clean.cmake b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..f7bcb5646f683
--- /dev/null
+++ b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o"
+  "CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o.d"
+  "ls-sycl-device"
+  "ls-sycl-device.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/ls-sycl-device.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.make b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.make
new file mode 100644
index 0000000000000..eda69ffb9384d
--- /dev/null
+++ b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for ls-sycl-device.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.ts b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..c5f1ce2e44e53
--- /dev/null
+++ b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for ls-sycl-device.
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/depend.make b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/depend.make
new file mode 100644
index 0000000000000..911368e45b466
--- /dev/null
+++ b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for ls-sycl-device.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/flags.make b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/flags.make
new file mode 100644
index 0000000000000..931e8c441e8bd
--- /dev/null
+++ b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS =  -std=c++17 -Wno-narrowing -O3 -fsycl -I./ -I//2024.0/include
+
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/link.txt b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/link.txt
new file mode 100644
index 0000000000000..44973a40da0f0
--- /dev/null
+++ b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx  -std=c++17 -Wno-narrowing -O3 -fsycl -rdynamic CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o -o ls-sycl-device  -lcommon -lllama 
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/progress.make b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/progress.make
new file mode 100644
index 0000000000000..e1615c182fcdc
--- /dev/null
+++ b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 32
+CMAKE_PROGRESS_2 = 33
+
diff --git a/examples/sycl/sycl/CMakeFiles/progress.marks b/examples/sycl/sycl/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/sycl/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/sycl/Makefile b/examples/sycl/sycl/Makefile
new file mode 100644
index 0000000000000..b6a09fb7f350b
--- /dev/null
+++ b/examples/sycl/sycl/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+sycl/CMakeFiles/ls-sycl-device.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/CMakeFiles/ls-sycl-device.dir/rule
+.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/rule
+
+# Convenience name for target.
+ls-sycl-device: sycl/CMakeFiles/ls-sycl-device.dir/rule
+.PHONY : ls-sycl-device
+
+# fast build rule for target.
+ls-sycl-device/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/build
+.PHONY : ls-sycl-device/fast
+
+# target to build an object file
+ls-sycl-device.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o
+.PHONY : ls-sycl-device.o
+
+# target to preprocess a source file
+ls-sycl-device.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.i
+.PHONY : ls-sycl-device.i
+
+# target to generate assembly for a file
+ls-sycl-device.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.s
+.PHONY : ls-sycl-device.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... ls-sycl-device"
+	@echo "... ls-sycl-device.o"
+	@echo "... ls-sycl-device.i"
+	@echo "... ls-sycl-device.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/sycl/cmake_install.cmake b/examples/sycl/sycl/cmake_install.cmake
new file mode 100644
index 0000000000000..17c549fd9d29c
--- /dev/null
+++ b/examples/sycl/sycl/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/ls-sycl-device")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/tokenize/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/tokenize/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/tokenize/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/tokenize/CMakeFiles/progress.marks b/examples/sycl/tokenize/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/tokenize/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..f887be24f00a7
--- /dev/null
+++ b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/tokenize/tokenize.cpp" "tokenize/CMakeFiles/tokenize.dir/tokenize.o" "gcc" "tokenize/CMakeFiles/tokenize.dir/tokenize.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/build.make b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/build.make
new file mode 100644
index 0000000000000..1164bb382d6e2
--- /dev/null
+++ b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include tokenize/CMakeFiles/tokenize.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include tokenize/CMakeFiles/tokenize.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include tokenize/CMakeFiles/tokenize.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include tokenize/CMakeFiles/tokenize.dir/flags.make
+
+tokenize/CMakeFiles/tokenize.dir/tokenize.o: tokenize/CMakeFiles/tokenize.dir/flags.make
+tokenize/CMakeFiles/tokenize.dir/tokenize.o: ../tokenize/tokenize.cpp
+tokenize/CMakeFiles/tokenize.dir/tokenize.o: tokenize/CMakeFiles/tokenize.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object tokenize/CMakeFiles/tokenize.dir/tokenize.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT tokenize/CMakeFiles/tokenize.dir/tokenize.o -MF CMakeFiles/tokenize.dir/tokenize.o.d -o CMakeFiles/tokenize.dir/tokenize.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/tokenize/tokenize.cpp
+
+tokenize/CMakeFiles/tokenize.dir/tokenize.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/tokenize.dir/tokenize.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/tokenize/tokenize.cpp > CMakeFiles/tokenize.dir/tokenize.i
+
+tokenize/CMakeFiles/tokenize.dir/tokenize.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/tokenize.dir/tokenize.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/tokenize/tokenize.cpp -o CMakeFiles/tokenize.dir/tokenize.s
+
+# Object files for target tokenize
+tokenize_OBJECTS = \
+"CMakeFiles/tokenize.dir/tokenize.o"
+
+# External object files for target tokenize
+tokenize_EXTERNAL_OBJECTS =
+
+tokenize/tokenize: tokenize/CMakeFiles/tokenize.dir/tokenize.o
+tokenize/tokenize: tokenize/CMakeFiles/tokenize.dir/build.make
+tokenize/tokenize: tokenize/CMakeFiles/tokenize.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable tokenize"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/tokenize.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+tokenize/CMakeFiles/tokenize.dir/build: tokenize/tokenize
+.PHONY : tokenize/CMakeFiles/tokenize.dir/build
+
+tokenize/CMakeFiles/tokenize.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize && $(CMAKE_COMMAND) -P CMakeFiles/tokenize.dir/cmake_clean.cmake
+.PHONY : tokenize/CMakeFiles/tokenize.dir/clean
+
+tokenize/CMakeFiles/tokenize.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/tokenize /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : tokenize/CMakeFiles/tokenize.dir/depend
+
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/cmake_clean.cmake b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..1f3463088133d
--- /dev/null
+++ b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/tokenize.dir/tokenize.o"
+  "CMakeFiles/tokenize.dir/tokenize.o.d"
+  "tokenize"
+  "tokenize.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/tokenize.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.make b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.make
new file mode 100644
index 0000000000000..13f791178b843
--- /dev/null
+++ b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for tokenize.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.ts b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..0900c89c764cc
--- /dev/null
+++ b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for tokenize.
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/depend.make b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/depend.make
new file mode 100644
index 0000000000000..e1f613484cfb7
--- /dev/null
+++ b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for tokenize.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/flags.make b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/link.txt b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/link.txt
new file mode 100644
index 0000000000000..a0037536c35a1
--- /dev/null
+++ b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/tokenize.dir/tokenize.o -o tokenize  -lcommon -lllama 
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/progress.make b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/progress.make
new file mode 100644
index 0000000000000..dbabe1f503005
--- /dev/null
+++ b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 50
+CMAKE_PROGRESS_2 = 51
+
diff --git a/examples/sycl/tokenize/Makefile b/examples/sycl/tokenize/Makefile
new file mode 100644
index 0000000000000..19d5b91c6bd62
--- /dev/null
+++ b/examples/sycl/tokenize/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+tokenize/CMakeFiles/tokenize.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/CMakeFiles/tokenize.dir/rule
+.PHONY : tokenize/CMakeFiles/tokenize.dir/rule
+
+# Convenience name for target.
+tokenize: tokenize/CMakeFiles/tokenize.dir/rule
+.PHONY : tokenize
+
+# fast build rule for target.
+tokenize/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/build
+.PHONY : tokenize/fast
+
+# target to build an object file
+tokenize.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/tokenize.o
+.PHONY : tokenize.o
+
+# target to preprocess a source file
+tokenize.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/tokenize.i
+.PHONY : tokenize.i
+
+# target to generate assembly for a file
+tokenize.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/tokenize.s
+.PHONY : tokenize.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... tokenize"
+	@echo "... tokenize.o"
+	@echo "... tokenize.i"
+	@echo "... tokenize.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/tokenize/cmake_install.cmake b/examples/sycl/tokenize/cmake_install.cmake
new file mode 100644
index 0000000000000..a3c2dfb59a705
--- /dev/null
+++ b/examples/sycl/tokenize/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/tokenize
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/tokenize")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize")
+    endif()
+  endif()
+endif()
+
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/train-text-from-scratch/CMakeFiles/CMakeDirectoryInformation.cmake
new file mode 100644
index 0000000000000..5c593a47325e1
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/CMakeFiles/CMakeDirectoryInformation.cmake
@@ -0,0 +1,16 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Relative path conversion top directories.
+set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
+set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
+
+# Force unix paths in dependencies.
+set(CMAKE_FORCE_UNIX_PATHS 1)
+
+
+# The C and CXX include file regular expressions for this directory.
+set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
+set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
+set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
+set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/progress.marks b/examples/sycl/train-text-from-scratch/CMakeFiles/progress.marks
new file mode 100644
index 0000000000000..0cfbf08886fca
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/CMakeFiles/progress.marks
@@ -0,0 +1 @@
+2
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake
new file mode 100644
index 0000000000000..b6611153346d0
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake
@@ -0,0 +1,19 @@
+
+# Consider dependencies only in project.
+set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
+
+# The set of languages for which implicit dependencies are needed:
+set(CMAKE_DEPENDS_LANGUAGES
+  )
+
+# The set of dependency files which are needed:
+set(CMAKE_DEPENDS_DEPENDENCY_FILES
+  "/home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch/train-text-from-scratch.cpp" "train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o" "gcc" "train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o.d"
+  )
+
+# Targets to which this target links.
+set(CMAKE_TARGET_LINKED_INFO_FILES
+  )
+
+# Fortran module output directory.
+set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make
new file mode 100644
index 0000000000000..5c57f36d5bf74
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make
@@ -0,0 +1,110 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Delete rule output on recipe failure.
+.DELETE_ON_ERROR:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+# Include any dependencies generated for this target.
+include train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend.make
+# Include any dependencies generated by the compiler for this target.
+include train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.make
+
+# Include the progress variables for this target.
+include train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/progress.make
+
+# Include the compile flags for this target's objects.
+include train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make
+
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o: ../train-text-from-scratch/train-text-from-scratch.cpp
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.ts
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o -MF CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o.d -o CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch/train-text-from-scratch.cpp
+
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.i: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.i"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch/train-text-from-scratch.cpp > CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.i
+
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.s: cmake_force
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.s"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch/train-text-from-scratch.cpp -o CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.s
+
+# Object files for target train-text-from-scratch
+train__text__from__scratch_OBJECTS = \
+"CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o"
+
+# External object files for target train-text-from-scratch
+train__text__from__scratch_EXTERNAL_OBJECTS =
+
+train-text-from-scratch/train-text-from-scratch: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o
+train-text-from-scratch/train-text-from-scratch: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make
+train-text-from-scratch/train-text-from-scratch: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/link.txt
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable train-text-from-scratch"
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/train-text-from-scratch.dir/link.txt --verbose=$(VERBOSE)
+
+# Rule to build all files generated by this target.
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build: train-text-from-scratch/train-text-from-scratch
+.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build
+
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch && $(CMAKE_COMMAND) -P CMakeFiles/train-text-from-scratch.dir/cmake_clean.cmake
+.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean
+
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake --color=$(COLOR)
+.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend
+
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/cmake_clean.cmake b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/cmake_clean.cmake
new file mode 100644
index 0000000000000..05bfb17e3ae10
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/cmake_clean.cmake
@@ -0,0 +1,11 @@
+file(REMOVE_RECURSE
+  "CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o"
+  "CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o.d"
+  "train-text-from-scratch"
+  "train-text-from-scratch.pdb"
+)
+
+# Per-language clean rules from dependency scanning.
+foreach(lang CXX)
+  include(CMakeFiles/train-text-from-scratch.dir/cmake_clean_${lang}.cmake OPTIONAL)
+endforeach()
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.make b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.make
new file mode 100644
index 0000000000000..e9531edfa6d92
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.make
@@ -0,0 +1,2 @@
+# Empty compiler generated dependencies file for train-text-from-scratch.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.ts b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.ts
new file mode 100644
index 0000000000000..27df3e1de02f5
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.ts
@@ -0,0 +1,2 @@
+# CMAKE generated file: DO NOT EDIT!
+# Timestamp file for compiler generated dependencies management for train-text-from-scratch.
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend.make b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend.make
new file mode 100644
index 0000000000000..4aa3cbd93ed4d
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend.make
@@ -0,0 +1,2 @@
+# Empty dependencies file for train-text-from-scratch.
+# This may be replaced when dependencies are built.
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make
new file mode 100644
index 0000000000000..6bddd3d69b220
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make
@@ -0,0 +1,10 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
+CXX_DEFINES = 
+
+CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
+
+CXX_FLAGS = 
+
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/link.txt b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/link.txt
new file mode 100644
index 0000000000000..e6e1f8bf105b5
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/link.txt
@@ -0,0 +1 @@
+/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o -o train-text-from-scratch  -lcommon -lllama 
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/progress.make b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/progress.make
new file mode 100644
index 0000000000000..3b1f03a283674
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/progress.make
@@ -0,0 +1,3 @@
+CMAKE_PROGRESS_1 = 52
+CMAKE_PROGRESS_2 = 53
+
diff --git a/examples/sycl/train-text-from-scratch/Makefile b/examples/sycl/train-text-from-scratch/Makefile
new file mode 100644
index 0000000000000..6b6a274f953ab
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/Makefile
@@ -0,0 +1,222 @@
+# CMAKE generated file: DO NOT EDIT!
+# Generated by "Unix Makefiles" Generator, CMake Version 3.22
+
+# Default target executed when no arguments are given to make.
+default_target: all
+.PHONY : default_target
+
+# Allow only one "make -f Makefile2" at a time, but pass parallelism.
+.NOTPARALLEL:
+
+#=============================================================================
+# Special targets provided by cmake.
+
+# Disable implicit rules so canonical targets will work.
+.SUFFIXES:
+
+# Disable VCS-based implicit rules.
+% : %,v
+
+# Disable VCS-based implicit rules.
+% : RCS/%
+
+# Disable VCS-based implicit rules.
+% : RCS/%,v
+
+# Disable VCS-based implicit rules.
+% : SCCS/s.%
+
+# Disable VCS-based implicit rules.
+% : s.%
+
+.SUFFIXES: .hpux_make_needs_suffix_list
+
+# Command-line flag to silence nested $(MAKE).
+$(VERBOSE)MAKESILENT = -s
+
+#Suppress display of executed commands.
+$(VERBOSE).SILENT:
+
+# A target that is always out of date.
+cmake_force:
+.PHONY : cmake_force
+
+#=============================================================================
+# Set environment variables for the build.
+
+# The shell in which to execute make rules.
+SHELL = /bin/sh
+
+# The CMake executable.
+CMAKE_COMMAND = /usr/bin/cmake
+
+# The command to remove a file.
+RM = /usr/bin/cmake -E rm -f
+
+# Escaping for special characters.
+EQUALS = =
+
+# The top-level source directory on which CMake was run.
+CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
+
+# The top-level build directory on which CMake was run.
+CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
+
+#=============================================================================
+# Targets provided globally by CMake.
+
+# Special rule for the target edit_cache
+edit_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
+	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
+.PHONY : edit_cache
+
+# Special rule for the target edit_cache
+edit_cache/fast: edit_cache
+.PHONY : edit_cache/fast
+
+# Special rule for the target rebuild_cache
+rebuild_cache:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
+	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
+.PHONY : rebuild_cache
+
+# Special rule for the target rebuild_cache
+rebuild_cache/fast: rebuild_cache
+.PHONY : rebuild_cache/fast
+
+# Special rule for the target list_install_components
+list_install_components:
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
+.PHONY : list_install_components
+
+# Special rule for the target list_install_components
+list_install_components/fast: list_install_components
+.PHONY : list_install_components/fast
+
+# Special rule for the target install
+install: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install
+
+# Special rule for the target install
+install/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
+	/usr/bin/cmake -P cmake_install.cmake
+.PHONY : install/fast
+
+# Special rule for the target install/local
+install/local: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local
+
+# Special rule for the target install/local
+install/local/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
+	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
+.PHONY : install/local/fast
+
+# Special rule for the target install/strip
+install/strip: preinstall
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip
+
+# Special rule for the target install/strip
+install/strip/fast: preinstall/fast
+	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
+	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
+.PHONY : install/strip/fast
+
+# The main all target
+all: cmake_check_build_system
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch//CMakeFiles/progress.marks
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/all
+	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
+.PHONY : all
+
+# The main clean target
+clean:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/clean
+.PHONY : clean
+
+# The main clean target
+clean/fast: clean
+.PHONY : clean/fast
+
+# Prepare targets for installation.
+preinstall: all
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/preinstall
+.PHONY : preinstall
+
+# Prepare targets for installation.
+preinstall/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/preinstall
+.PHONY : preinstall/fast
+
+# clear depends
+depend:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
+.PHONY : depend
+
+# Convenience name for target.
+train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule
+.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule
+
+# Convenience name for target.
+train-text-from-scratch: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule
+.PHONY : train-text-from-scratch
+
+# fast build rule for target.
+train-text-from-scratch/fast:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build
+.PHONY : train-text-from-scratch/fast
+
+# target to build an object file
+train-text-from-scratch.o:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o
+.PHONY : train-text-from-scratch.o
+
+# target to preprocess a source file
+train-text-from-scratch.i:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.i
+.PHONY : train-text-from-scratch.i
+
+# target to generate assembly for a file
+train-text-from-scratch.s:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.s
+.PHONY : train-text-from-scratch.s
+
+# Help Target
+help:
+	@echo "The following are some of the valid targets for this Makefile:"
+	@echo "... all (the default if no target is provided)"
+	@echo "... clean"
+	@echo "... depend"
+	@echo "... edit_cache"
+	@echo "... install"
+	@echo "... install/local"
+	@echo "... install/strip"
+	@echo "... list_install_components"
+	@echo "... rebuild_cache"
+	@echo "... train-text-from-scratch"
+	@echo "... train-text-from-scratch.o"
+	@echo "... train-text-from-scratch.i"
+	@echo "... train-text-from-scratch.s"
+.PHONY : help
+
+
+
+#=============================================================================
+# Special targets to cleanup operation of make.
+
+# Special rule to run CMake to check the build system integrity.
+# No rule that depends on this can have commands that come from listfiles
+# because they might be regenerated.
+cmake_check_build_system:
+	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
+.PHONY : cmake_check_build_system
+
diff --git a/examples/sycl/train-text-from-scratch/cmake_install.cmake b/examples/sycl/train-text-from-scratch/cmake_install.cmake
new file mode 100644
index 0000000000000..ea9a267286952
--- /dev/null
+++ b/examples/sycl/train-text-from-scratch/cmake_install.cmake
@@ -0,0 +1,60 @@
+# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch
+
+# Set the install prefix
+if(NOT DEFINED CMAKE_INSTALL_PREFIX)
+  set(CMAKE_INSTALL_PREFIX "/usr/local")
+endif()
+string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
+
+# Set the install configuration name.
+if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
+  if(BUILD_TYPE)
+    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
+           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
+  else()
+    set(CMAKE_INSTALL_CONFIG_NAME "")
+  endif()
+  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
+endif()
+
+# Set the component getting installed.
+if(NOT CMAKE_INSTALL_COMPONENT)
+  if(COMPONENT)
+    message(STATUS "Install component: \"${COMPONENT}\"")
+    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
+  else()
+    set(CMAKE_INSTALL_COMPONENT)
+  endif()
+endif()
+
+# Install shared libraries without execute permission?
+if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
+  set(CMAKE_INSTALL_SO_NO_EXE "1")
+endif()
+
+# Is this installation the result of a crosscompile?
+if(NOT DEFINED CMAKE_CROSSCOMPILING)
+  set(CMAKE_CROSSCOMPILING "FALSE")
+endif()
+
+# Set default install directory permissions.
+if(NOT DEFINED CMAKE_OBJDUMP)
+  set(CMAKE_OBJDUMP "/usr/bin/objdump")
+endif()
+
+if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch")
+    file(RPATH_CHECK
+         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch"
+         RPATH "")
+  endif()
+  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/train-text-from-scratch")
+  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch" AND
+     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch")
+    if(CMAKE_INSTALL_DO_STRIP)
+      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch")
+    endif()
+  endif()
+endif()
+

From 7350fd48ef281a4a062656d4657d0317d758efff Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Mon, 15 Jan 2024 16:03:38 +0800
Subject: [PATCH 27/90] add ls-sycl-device, rm unused files

---
 examples/sycl/CMakeCache.txt                  |  369 -----
 .../CMakeFiles/3.22.1/CMakeCCompiler.cmake    |   72 -
 .../CMakeFiles/3.22.1/CMakeCXXCompiler.cmake  |   83 --
 .../sycl/CMakeFiles/3.22.1/CMakeSystem.cmake  |   15 -
 .../3.22.1/CompilerIdC/CMakeCCompilerId.c     |  803 -----------
 .../sycl/CMakeFiles/3.22.1/CompilerIdC/a.out  |  Bin 16256 -> 0 bytes
 .../CompilerIdCXX/CMakeCXXCompilerId.cpp      |  791 -----------
 .../CMakeFiles/3.22.1/CompilerIdCXX/a.out     |  Bin 16264 -> 0 bytes
 .../CMakeDirectoryInformation.cmake           |   16 -
 examples/sycl/CMakeFiles/Makefile.cmake       |  130 --
 examples/sycl/CMakeFiles/Makefile2            | 1239 -----------------
 examples/sycl/CMakeFiles/Progress/1           |    1 -
 examples/sycl/CMakeFiles/Progress/count.txt   |    1 -
 .../sycl/CMakeFiles/TargetDirectories.txt     |  183 ---
 examples/sycl/CMakeFiles/cmake.check_cache    |    1 -
 examples/sycl/CMakeFiles/progress.marks       |    1 -
 examples/sycl/CMakeLists.txt                  |    2 +-
 examples/sycl/Makefile                        |  567 --------
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../baby-llama.dir/DependInfo.cmake           |   19 -
 .../CMakeFiles/baby-llama.dir/build.make      |  110 --
 .../baby-llama.dir/cmake_clean.cmake          |   11 -
 .../baby-llama.dir/compiler_depend.make       |    2 -
 .../baby-llama.dir/compiler_depend.ts         |    2 -
 .../CMakeFiles/baby-llama.dir/depend.make     |    2 -
 .../CMakeFiles/baby-llama.dir/flags.make      |   10 -
 .../CMakeFiles/baby-llama.dir/link.txt        |    1 -
 .../CMakeFiles/baby-llama.dir/progress.make   |    3 -
 .../sycl/baby-llama/CMakeFiles/progress.marks |    1 -
 examples/sycl/baby-llama/Makefile             |  222 ---
 examples/sycl/baby-llama/cmake_install.cmake  |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../batched-bench.dir/DependInfo.cmake        |   19 -
 .../CMakeFiles/batched-bench.dir/build.make   |  110 --
 .../batched-bench.dir/cmake_clean.cmake       |   11 -
 .../batched-bench.dir/compiler_depend.make    |    2 -
 .../batched-bench.dir/compiler_depend.ts      |    2 -
 .../CMakeFiles/batched-bench.dir/depend.make  |    2 -
 .../CMakeFiles/batched-bench.dir/flags.make   |   10 -
 .../CMakeFiles/batched-bench.dir/link.txt     |    1 -
 .../batched-bench.dir/progress.make           |    3 -
 .../batched-bench/CMakeFiles/progress.marks   |    1 -
 examples/sycl/batched-bench/Makefile          |  222 ---
 .../sycl/batched-bench/cmake_install.cmake    |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/batched.dir/DependInfo.cmake   |   19 -
 .../batched/CMakeFiles/batched.dir/build.make |  110 --
 .../CMakeFiles/batched.dir/cmake_clean.cmake  |   11 -
 .../batched.dir/compiler_depend.make          |    2 -
 .../CMakeFiles/batched.dir/compiler_depend.ts |    2 -
 .../CMakeFiles/batched.dir/depend.make        |    2 -
 .../batched/CMakeFiles/batched.dir/flags.make |   10 -
 .../batched/CMakeFiles/batched.dir/link.txt   |    1 -
 .../CMakeFiles/batched.dir/progress.make      |    3 -
 .../sycl/batched/CMakeFiles/progress.marks    |    1 -
 examples/sycl/batched/Makefile                |  222 ---
 examples/sycl/batched/cmake_install.cmake     |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../beam-search.dir/DependInfo.cmake          |   19 -
 .../CMakeFiles/beam-search.dir/build.make     |  110 --
 .../beam-search.dir/cmake_clean.cmake         |   11 -
 .../beam-search.dir/compiler_depend.make      |    2 -
 .../beam-search.dir/compiler_depend.ts        |    2 -
 .../CMakeFiles/beam-search.dir/depend.make    |    2 -
 .../CMakeFiles/beam-search.dir/flags.make     |   10 -
 .../CMakeFiles/beam-search.dir/link.txt       |    1 -
 .../CMakeFiles/beam-search.dir/progress.make  |    3 -
 .../beam-search/CMakeFiles/progress.marks     |    1 -
 examples/sycl/beam-search/Makefile            |  222 ---
 examples/sycl/beam-search/cmake_install.cmake |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/benchmark.dir/DependInfo.cmake |   19 -
 .../CMakeFiles/benchmark.dir/build.make       |  110 --
 .../benchmark.dir/cmake_clean.cmake           |   11 -
 .../benchmark.dir/compiler_depend.make        |    2 -
 .../benchmark.dir/compiler_depend.ts          |    2 -
 .../CMakeFiles/benchmark.dir/depend.make      |    2 -
 .../CMakeFiles/benchmark.dir/flags.make       |   10 -
 .../CMakeFiles/benchmark.dir/link.txt         |    1 -
 .../CMakeFiles/benchmark.dir/progress.make    |    3 -
 .../sycl/benchmark/CMakeFiles/progress.marks  |    1 -
 examples/sycl/benchmark/Makefile              |  222 ---
 examples/sycl/benchmark/cmake_install.cmake   |   60 -
 examples/sycl/cmake_install.cmake             |   84 --
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../DependInfo.cmake                          |   19 -
 .../convert-llama2c-to-ggml.dir/build.make    |  110 --
 .../cmake_clean.cmake                         |   11 -
 .../compiler_depend.make                      |    2 -
 .../compiler_depend.ts                        |    2 -
 .../convert-llama2c-to-ggml.dir/depend.make   |    2 -
 .../convert-llama2c-to-ggml.dir/flags.make    |   10 -
 .../convert-llama2c-to-ggml.dir/link.txt      |    1 -
 .../convert-llama2c-to-ggml.dir/progress.make |    3 -
 .../CMakeFiles/progress.marks                 |    1 -
 .../sycl/convert-llama2c-to-ggml/Makefile     |  222 ---
 .../cmake_install.cmake                       |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/embedding.dir/DependInfo.cmake |   19 -
 .../CMakeFiles/embedding.dir/build.make       |  110 --
 .../embedding.dir/cmake_clean.cmake           |   11 -
 .../embedding.dir/compiler_depend.make        |    2 -
 .../embedding.dir/compiler_depend.ts          |    2 -
 .../CMakeFiles/embedding.dir/depend.make      |    2 -
 .../CMakeFiles/embedding.dir/flags.make       |   10 -
 .../CMakeFiles/embedding.dir/link.txt         |    1 -
 .../CMakeFiles/embedding.dir/progress.make    |    3 -
 .../sycl/embedding/CMakeFiles/progress.marks  |    1 -
 examples/sycl/embedding/Makefile              |  222 ---
 examples/sycl/embedding/cmake_install.cmake   |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../export-lora.dir/DependInfo.cmake          |   19 -
 .../CMakeFiles/export-lora.dir/build.make     |  110 --
 .../export-lora.dir/cmake_clean.cmake         |   11 -
 .../export-lora.dir/compiler_depend.make      |    2 -
 .../export-lora.dir/compiler_depend.ts        |    2 -
 .../CMakeFiles/export-lora.dir/depend.make    |    2 -
 .../CMakeFiles/export-lora.dir/flags.make     |   10 -
 .../CMakeFiles/export-lora.dir/link.txt       |    1 -
 .../CMakeFiles/export-lora.dir/progress.make  |    3 -
 .../export-lora/CMakeFiles/progress.marks     |    1 -
 examples/sycl/export-lora/Makefile            |  222 ---
 examples/sycl/export-lora/cmake_install.cmake |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/finetune.dir/DependInfo.cmake  |   19 -
 .../CMakeFiles/finetune.dir/build.make        |  110 --
 .../CMakeFiles/finetune.dir/cmake_clean.cmake |   11 -
 .../finetune.dir/compiler_depend.make         |    2 -
 .../finetune.dir/compiler_depend.ts           |    2 -
 .../CMakeFiles/finetune.dir/depend.make       |    2 -
 .../CMakeFiles/finetune.dir/flags.make        |   10 -
 .../finetune/CMakeFiles/finetune.dir/link.txt |    1 -
 .../CMakeFiles/finetune.dir/progress.make     |    3 -
 .../sycl/finetune/CMakeFiles/progress.marks   |    1 -
 examples/sycl/finetune/Makefile               |  222 ---
 examples/sycl/finetune/cmake_install.cmake    |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/infill.dir/DependInfo.cmake    |   19 -
 .../infill/CMakeFiles/infill.dir/build.make   |  110 --
 .../CMakeFiles/infill.dir/cmake_clean.cmake   |   11 -
 .../infill.dir/compiler_depend.make           |    2 -
 .../CMakeFiles/infill.dir/compiler_depend.ts  |    2 -
 .../infill/CMakeFiles/infill.dir/depend.make  |    2 -
 .../infill/CMakeFiles/infill.dir/flags.make   |   10 -
 .../infill/CMakeFiles/infill.dir/link.txt     |    1 -
 .../CMakeFiles/infill.dir/progress.make       |    3 -
 .../sycl/infill/CMakeFiles/progress.marks     |    1 -
 examples/sycl/infill/Makefile                 |  222 ---
 examples/sycl/infill/cmake_install.cmake      |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../llama-bench.dir/DependInfo.cmake          |   19 -
 .../CMakeFiles/llama-bench.dir/build.make     |  110 --
 .../llama-bench.dir/cmake_clean.cmake         |   11 -
 .../llama-bench.dir/compiler_depend.make      |    2 -
 .../llama-bench.dir/compiler_depend.ts        |    2 -
 .../CMakeFiles/llama-bench.dir/depend.make    |    2 -
 .../CMakeFiles/llama-bench.dir/flags.make     |   10 -
 .../CMakeFiles/llama-bench.dir/link.txt       |    1 -
 .../CMakeFiles/llama-bench.dir/progress.make  |    3 -
 .../llama-bench/CMakeFiles/progress.marks     |    1 -
 examples/sycl/llama-bench/Makefile            |  222 ---
 examples/sycl/llama-bench/cmake_install.cmake |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/llava-cli.dir/DependInfo.cmake |   19 -
 .../llava/CMakeFiles/llava-cli.dir/build.make |  114 --
 .../llava-cli.dir/cmake_clean.cmake           |   11 -
 .../llava-cli.dir/compiler_depend.make        |    2 -
 .../llava-cli.dir/compiler_depend.ts          |    2 -
 .../CMakeFiles/llava-cli.dir/depend.make      |    2 -
 .../llava/CMakeFiles/llava-cli.dir/flags.make |   10 -
 .../llava/CMakeFiles/llava-cli.dir/link.txt   |    1 -
 .../CMakeFiles/llava-cli.dir/progress.make    |    3 -
 .../CMakeFiles/llava.dir/DependInfo.cmake     |   20 -
 .../llava/CMakeFiles/llava.dir/build.make     |  116 --
 .../CMakeFiles/llava.dir/cmake_clean.cmake    |   11 -
 .../CMakeFiles/llava.dir/compiler_depend.make |    2 -
 .../CMakeFiles/llava.dir/compiler_depend.ts   |    2 -
 .../llava/CMakeFiles/llava.dir/depend.make    |    2 -
 .../llava/CMakeFiles/llava.dir/flags.make     |   10 -
 .../llava/CMakeFiles/llava.dir/progress.make  |    3 -
 .../llava_static.dir/DependInfo.cmake         |   18 -
 .../CMakeFiles/llava_static.dir/build.make    |   99 --
 .../llava_static.dir/cmake_clean.cmake        |    9 -
 .../llava_static.dir/cmake_clean_target.cmake |    3 -
 .../llava_static.dir/compiler_depend.make     |    2 -
 .../llava_static.dir/compiler_depend.ts       |    2 -
 .../CMakeFiles/llava_static.dir/depend.make   |    2 -
 .../CMakeFiles/llava_static.dir/flags.make    |   10 -
 .../CMakeFiles/llava_static.dir/link.txt      |    2 -
 .../CMakeFiles/llava_static.dir/progress.make |    2 -
 examples/sycl/llava/CMakeFiles/progress.marks |    1 -
 examples/sycl/llava/Makefile                  |  288 ----
 examples/sycl/llava/cmake_install.cmake       |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/lookahead.dir/DependInfo.cmake |   19 -
 .../CMakeFiles/lookahead.dir/build.make       |  110 --
 .../lookahead.dir/cmake_clean.cmake           |   11 -
 .../lookahead.dir/compiler_depend.make        |    2 -
 .../lookahead.dir/compiler_depend.ts          |    2 -
 .../CMakeFiles/lookahead.dir/depend.make      |    2 -
 .../CMakeFiles/lookahead.dir/flags.make       |   10 -
 .../CMakeFiles/lookahead.dir/link.txt         |    1 -
 .../CMakeFiles/lookahead.dir/progress.make    |    3 -
 .../sycl/lookahead/CMakeFiles/progress.marks  |    1 -
 examples/sycl/lookahead/Makefile              |  222 ---
 examples/sycl/lookahead/cmake_install.cmake   |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/lookup.dir/DependInfo.cmake    |   19 -
 .../lookup/CMakeFiles/lookup.dir/build.make   |  110 --
 .../CMakeFiles/lookup.dir/cmake_clean.cmake   |   11 -
 .../lookup.dir/compiler_depend.make           |    2 -
 .../CMakeFiles/lookup.dir/compiler_depend.ts  |    2 -
 .../lookup/CMakeFiles/lookup.dir/depend.make  |    2 -
 .../lookup/CMakeFiles/lookup.dir/flags.make   |   10 -
 .../lookup/CMakeFiles/lookup.dir/link.txt     |    1 -
 .../CMakeFiles/lookup.dir/progress.make       |    3 -
 .../sycl/lookup/CMakeFiles/progress.marks     |    1 -
 examples/sycl/lookup/Makefile                 |  222 ---
 examples/sycl/lookup/cmake_install.cmake      |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../main/CMakeFiles/main.dir/DependInfo.cmake |   19 -
 .../sycl/main/CMakeFiles/main.dir/build.make  |  110 --
 .../CMakeFiles/main.dir/cmake_clean.cmake     |   11 -
 .../CMakeFiles/main.dir/compiler_depend.make  |    2 -
 .../CMakeFiles/main.dir/compiler_depend.ts    |    2 -
 .../sycl/main/CMakeFiles/main.dir/depend.make |    2 -
 .../sycl/main/CMakeFiles/main.dir/flags.make  |   10 -
 .../sycl/main/CMakeFiles/main.dir/link.txt    |    1 -
 .../main/CMakeFiles/main.dir/progress.make    |    3 -
 examples/sycl/main/CMakeFiles/progress.marks  |    1 -
 examples/sycl/main/Makefile                   |  222 ---
 examples/sycl/main/cmake_install.cmake        |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/parallel.dir/DependInfo.cmake  |   19 -
 .../CMakeFiles/parallel.dir/build.make        |  110 --
 .../CMakeFiles/parallel.dir/cmake_clean.cmake |   11 -
 .../parallel.dir/compiler_depend.make         |    2 -
 .../parallel.dir/compiler_depend.ts           |    2 -
 .../CMakeFiles/parallel.dir/depend.make       |    2 -
 .../CMakeFiles/parallel.dir/flags.make        |   10 -
 .../parallel/CMakeFiles/parallel.dir/link.txt |    1 -
 .../CMakeFiles/parallel.dir/progress.make     |    3 -
 .../sycl/parallel/CMakeFiles/progress.marks   |    1 -
 examples/sycl/parallel/Makefile               |  222 ---
 examples/sycl/parallel/cmake_install.cmake    |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../perplexity.dir/DependInfo.cmake           |   19 -
 .../CMakeFiles/perplexity.dir/build.make      |  110 --
 .../perplexity.dir/cmake_clean.cmake          |   11 -
 .../perplexity.dir/compiler_depend.make       |    2 -
 .../perplexity.dir/compiler_depend.ts         |    2 -
 .../CMakeFiles/perplexity.dir/depend.make     |    2 -
 .../CMakeFiles/perplexity.dir/flags.make      |   10 -
 .../CMakeFiles/perplexity.dir/link.txt        |    1 -
 .../CMakeFiles/perplexity.dir/progress.make   |    3 -
 .../sycl/perplexity/CMakeFiles/progress.marks |    1 -
 examples/sycl/perplexity/Makefile             |  222 ---
 examples/sycl/perplexity/cmake_install.cmake  |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../quantize-stats/CMakeFiles/progress.marks  |    1 -
 .../quantize-stats.dir/DependInfo.cmake       |   19 -
 .../CMakeFiles/quantize-stats.dir/build.make  |  110 --
 .../quantize-stats.dir/cmake_clean.cmake      |   11 -
 .../quantize-stats.dir/compiler_depend.make   |    2 -
 .../quantize-stats.dir/compiler_depend.ts     |    2 -
 .../CMakeFiles/quantize-stats.dir/depend.make |    2 -
 .../CMakeFiles/quantize-stats.dir/flags.make  |   10 -
 .../CMakeFiles/quantize-stats.dir/link.txt    |    1 -
 .../quantize-stats.dir/progress.make          |    3 -
 examples/sycl/quantize-stats/Makefile         |  222 ---
 .../sycl/quantize-stats/cmake_install.cmake   |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../sycl/quantize/CMakeFiles/progress.marks   |    1 -
 .../CMakeFiles/quantize.dir/DependInfo.cmake  |   19 -
 .../CMakeFiles/quantize.dir/build.make        |  110 --
 .../CMakeFiles/quantize.dir/cmake_clean.cmake |   11 -
 .../quantize.dir/compiler_depend.make         |    2 -
 .../quantize.dir/compiler_depend.ts           |    2 -
 .../CMakeFiles/quantize.dir/depend.make       |    2 -
 .../CMakeFiles/quantize.dir/flags.make        |   10 -
 .../quantize/CMakeFiles/quantize.dir/link.txt |    1 -
 .../CMakeFiles/quantize.dir/progress.make     |    3 -
 examples/sycl/quantize/Makefile               |  222 ---
 examples/sycl/quantize/cmake_install.cmake    |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../save-load-state/CMakeFiles/progress.marks |    1 -
 .../save-load-state.dir/DependInfo.cmake      |   19 -
 .../CMakeFiles/save-load-state.dir/build.make |  110 --
 .../save-load-state.dir/cmake_clean.cmake     |   11 -
 .../save-load-state.dir/compiler_depend.make  |    2 -
 .../save-load-state.dir/compiler_depend.ts    |    2 -
 .../save-load-state.dir/depend.make           |    2 -
 .../CMakeFiles/save-load-state.dir/flags.make |   10 -
 .../CMakeFiles/save-load-state.dir/link.txt   |    1 -
 .../save-load-state.dir/progress.make         |    3 -
 examples/sycl/save-load-state/Makefile        |  222 ---
 .../sycl/save-load-state/cmake_install.cmake  |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../sycl/simple/CMakeFiles/progress.marks     |    1 -
 .../CMakeFiles/simple.dir/DependInfo.cmake    |   19 -
 .../simple/CMakeFiles/simple.dir/build.make   |  110 --
 .../CMakeFiles/simple.dir/cmake_clean.cmake   |   11 -
 .../simple.dir/compiler_depend.make           |    2 -
 .../CMakeFiles/simple.dir/compiler_depend.ts  |    2 -
 .../simple/CMakeFiles/simple.dir/depend.make  |    2 -
 .../simple/CMakeFiles/simple.dir/flags.make   |   10 -
 .../simple/CMakeFiles/simple.dir/link.txt     |    1 -
 .../CMakeFiles/simple.dir/progress.make       |    3 -
 examples/sycl/simple/Makefile                 |  222 ---
 examples/sycl/simple/cmake_install.cmake      |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../speculative/CMakeFiles/progress.marks     |    1 -
 .../speculative.dir/DependInfo.cmake          |   19 -
 .../CMakeFiles/speculative.dir/build.make     |  110 --
 .../speculative.dir/cmake_clean.cmake         |   11 -
 .../speculative.dir/compiler_depend.make      |    2 -
 .../speculative.dir/compiler_depend.ts        |    2 -
 .../CMakeFiles/speculative.dir/depend.make    |    2 -
 .../CMakeFiles/speculative.dir/flags.make     |   10 -
 .../CMakeFiles/speculative.dir/link.txt       |    1 -
 .../CMakeFiles/speculative.dir/progress.make  |    3 -
 examples/sycl/speculative/Makefile            |  222 ---
 examples/sycl/speculative/cmake_install.cmake |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../ls-sycl-device.dir/DependInfo.cmake       |   19 -
 .../CMakeFiles/ls-sycl-device.dir/build.make  |  110 --
 .../ls-sycl-device.dir/cmake_clean.cmake      |   11 -
 .../ls-sycl-device.dir/compiler_depend.make   |    2 -
 .../ls-sycl-device.dir/compiler_depend.ts     |    2 -
 .../CMakeFiles/ls-sycl-device.dir/depend.make |    2 -
 .../CMakeFiles/ls-sycl-device.dir/flags.make  |   10 -
 .../CMakeFiles/ls-sycl-device.dir/link.txt    |    1 -
 .../ls-sycl-device.dir/progress.make          |    3 -
 examples/sycl/sycl/CMakeFiles/progress.marks  |    1 -
 examples/sycl/sycl/Makefile                   |  222 ---
 examples/sycl/sycl/cmake_install.cmake        |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../sycl/tokenize/CMakeFiles/progress.marks   |    1 -
 .../CMakeFiles/tokenize.dir/DependInfo.cmake  |   19 -
 .../CMakeFiles/tokenize.dir/build.make        |  110 --
 .../CMakeFiles/tokenize.dir/cmake_clean.cmake |   11 -
 .../tokenize.dir/compiler_depend.make         |    2 -
 .../tokenize.dir/compiler_depend.ts           |    2 -
 .../CMakeFiles/tokenize.dir/depend.make       |    2 -
 .../CMakeFiles/tokenize.dir/flags.make        |   10 -
 .../tokenize/CMakeFiles/tokenize.dir/link.txt |    1 -
 .../CMakeFiles/tokenize.dir/progress.make     |    3 -
 examples/sycl/tokenize/Makefile               |  222 ---
 examples/sycl/tokenize/cmake_install.cmake    |   60 -
 .../CMakeDirectoryInformation.cmake           |   16 -
 .../CMakeFiles/progress.marks                 |    1 -
 .../DependInfo.cmake                          |   19 -
 .../train-text-from-scratch.dir/build.make    |  110 --
 .../cmake_clean.cmake                         |   11 -
 .../compiler_depend.make                      |    2 -
 .../compiler_depend.ts                        |    2 -
 .../train-text-from-scratch.dir/depend.make   |    2 -
 .../train-text-from-scratch.dir/flags.make    |   10 -
 .../train-text-from-scratch.dir/link.txt      |    1 -
 .../train-text-from-scratch.dir/progress.make |    3 -
 .../sycl/train-text-from-scratch/Makefile     |  222 ---
 .../cmake_install.cmake                       |   60 -
 362 files changed, 1 insertion(+), 16217 deletions(-)
 delete mode 100644 examples/sycl/CMakeCache.txt
 delete mode 100644 examples/sycl/CMakeFiles/3.22.1/CMakeCCompiler.cmake
 delete mode 100644 examples/sycl/CMakeFiles/3.22.1/CMakeCXXCompiler.cmake
 delete mode 100644 examples/sycl/CMakeFiles/3.22.1/CMakeSystem.cmake
 delete mode 100644 examples/sycl/CMakeFiles/3.22.1/CompilerIdC/CMakeCCompilerId.c
 delete mode 100755 examples/sycl/CMakeFiles/3.22.1/CompilerIdC/a.out
 delete mode 100644 examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/CMakeCXXCompilerId.cpp
 delete mode 100755 examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/a.out
 delete mode 100644 examples/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/CMakeFiles/Makefile.cmake
 delete mode 100644 examples/sycl/CMakeFiles/Makefile2
 delete mode 100644 examples/sycl/CMakeFiles/Progress/1
 delete mode 100644 examples/sycl/CMakeFiles/Progress/count.txt
 delete mode 100644 examples/sycl/CMakeFiles/TargetDirectories.txt
 delete mode 100644 examples/sycl/CMakeFiles/cmake.check_cache
 delete mode 100644 examples/sycl/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/Makefile
 delete mode 100644 examples/sycl/baby-llama/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/build.make
 delete mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.make
 delete mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/depend.make
 delete mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/flags.make
 delete mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/link.txt
 delete mode 100644 examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/progress.make
 delete mode 100644 examples/sycl/baby-llama/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/baby-llama/Makefile
 delete mode 100644 examples/sycl/baby-llama/cmake_install.cmake
 delete mode 100644 examples/sycl/batched-bench/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/build.make
 delete mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.make
 delete mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/depend.make
 delete mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/flags.make
 delete mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/link.txt
 delete mode 100644 examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/progress.make
 delete mode 100644 examples/sycl/batched-bench/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/batched-bench/Makefile
 delete mode 100644 examples/sycl/batched-bench/cmake_install.cmake
 delete mode 100644 examples/sycl/batched/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/build.make
 delete mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.make
 delete mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/depend.make
 delete mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/flags.make
 delete mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/link.txt
 delete mode 100644 examples/sycl/batched/CMakeFiles/batched.dir/progress.make
 delete mode 100644 examples/sycl/batched/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/batched/Makefile
 delete mode 100644 examples/sycl/batched/cmake_install.cmake
 delete mode 100644 examples/sycl/beam-search/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/build.make
 delete mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.make
 delete mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/depend.make
 delete mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/flags.make
 delete mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/link.txt
 delete mode 100644 examples/sycl/beam-search/CMakeFiles/beam-search.dir/progress.make
 delete mode 100644 examples/sycl/beam-search/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/beam-search/Makefile
 delete mode 100644 examples/sycl/beam-search/cmake_install.cmake
 delete mode 100644 examples/sycl/benchmark/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/build.make
 delete mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.make
 delete mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/depend.make
 delete mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/flags.make
 delete mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/link.txt
 delete mode 100644 examples/sycl/benchmark/CMakeFiles/benchmark.dir/progress.make
 delete mode 100644 examples/sycl/benchmark/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/benchmark/Makefile
 delete mode 100644 examples/sycl/benchmark/cmake_install.cmake
 delete mode 100644 examples/sycl/cmake_install.cmake
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.make
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend.make
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/link.txt
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/progress.make
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/Makefile
 delete mode 100644 examples/sycl/convert-llama2c-to-ggml/cmake_install.cmake
 delete mode 100644 examples/sycl/embedding/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/build.make
 delete mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.make
 delete mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/depend.make
 delete mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/flags.make
 delete mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/link.txt
 delete mode 100644 examples/sycl/embedding/CMakeFiles/embedding.dir/progress.make
 delete mode 100644 examples/sycl/embedding/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/embedding/Makefile
 delete mode 100644 examples/sycl/embedding/cmake_install.cmake
 delete mode 100644 examples/sycl/export-lora/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/build.make
 delete mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.make
 delete mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/depend.make
 delete mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/flags.make
 delete mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/link.txt
 delete mode 100644 examples/sycl/export-lora/CMakeFiles/export-lora.dir/progress.make
 delete mode 100644 examples/sycl/export-lora/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/export-lora/Makefile
 delete mode 100644 examples/sycl/export-lora/cmake_install.cmake
 delete mode 100644 examples/sycl/finetune/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/build.make
 delete mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.make
 delete mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/depend.make
 delete mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/flags.make
 delete mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/link.txt
 delete mode 100644 examples/sycl/finetune/CMakeFiles/finetune.dir/progress.make
 delete mode 100644 examples/sycl/finetune/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/finetune/Makefile
 delete mode 100644 examples/sycl/finetune/cmake_install.cmake
 delete mode 100644 examples/sycl/infill/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/build.make
 delete mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.make
 delete mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/depend.make
 delete mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/flags.make
 delete mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/link.txt
 delete mode 100644 examples/sycl/infill/CMakeFiles/infill.dir/progress.make
 delete mode 100644 examples/sycl/infill/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/infill/Makefile
 delete mode 100644 examples/sycl/infill/cmake_install.cmake
 delete mode 100644 examples/sycl/llama-bench/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/build.make
 delete mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.make
 delete mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/depend.make
 delete mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/flags.make
 delete mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/link.txt
 delete mode 100644 examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/progress.make
 delete mode 100644 examples/sycl/llama-bench/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/llama-bench/Makefile
 delete mode 100644 examples/sycl/llama-bench/cmake_install.cmake
 delete mode 100644 examples/sycl/llava/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/build.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/depend.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/flags.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/link.txt
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava-cli.dir/progress.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/build.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/depend.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/flags.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava.dir/progress.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/build.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean_target.cmake
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/depend.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/flags.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/link.txt
 delete mode 100644 examples/sycl/llava/CMakeFiles/llava_static.dir/progress.make
 delete mode 100644 examples/sycl/llava/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/llava/Makefile
 delete mode 100644 examples/sycl/llava/cmake_install.cmake
 delete mode 100644 examples/sycl/lookahead/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/build.make
 delete mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.make
 delete mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/depend.make
 delete mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/flags.make
 delete mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/link.txt
 delete mode 100644 examples/sycl/lookahead/CMakeFiles/lookahead.dir/progress.make
 delete mode 100644 examples/sycl/lookahead/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/lookahead/Makefile
 delete mode 100644 examples/sycl/lookahead/cmake_install.cmake
 delete mode 100644 examples/sycl/lookup/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/build.make
 delete mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.make
 delete mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/depend.make
 delete mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/flags.make
 delete mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/link.txt
 delete mode 100644 examples/sycl/lookup/CMakeFiles/lookup.dir/progress.make
 delete mode 100644 examples/sycl/lookup/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/lookup/Makefile
 delete mode 100644 examples/sycl/lookup/cmake_install.cmake
 delete mode 100644 examples/sycl/main/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/main/CMakeFiles/main.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/main/CMakeFiles/main.dir/build.make
 delete mode 100644 examples/sycl/main/CMakeFiles/main.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/main/CMakeFiles/main.dir/compiler_depend.make
 delete mode 100644 examples/sycl/main/CMakeFiles/main.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/main/CMakeFiles/main.dir/depend.make
 delete mode 100644 examples/sycl/main/CMakeFiles/main.dir/flags.make
 delete mode 100644 examples/sycl/main/CMakeFiles/main.dir/link.txt
 delete mode 100644 examples/sycl/main/CMakeFiles/main.dir/progress.make
 delete mode 100644 examples/sycl/main/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/main/Makefile
 delete mode 100644 examples/sycl/main/cmake_install.cmake
 delete mode 100644 examples/sycl/parallel/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/build.make
 delete mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.make
 delete mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/depend.make
 delete mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/flags.make
 delete mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/link.txt
 delete mode 100644 examples/sycl/parallel/CMakeFiles/parallel.dir/progress.make
 delete mode 100644 examples/sycl/parallel/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/parallel/Makefile
 delete mode 100644 examples/sycl/parallel/cmake_install.cmake
 delete mode 100644 examples/sycl/perplexity/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/build.make
 delete mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.make
 delete mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/depend.make
 delete mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/flags.make
 delete mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/link.txt
 delete mode 100644 examples/sycl/perplexity/CMakeFiles/perplexity.dir/progress.make
 delete mode 100644 examples/sycl/perplexity/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/perplexity/Makefile
 delete mode 100644 examples/sycl/perplexity/cmake_install.cmake
 delete mode 100644 examples/sycl/quantize-stats/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/quantize-stats/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/build.make
 delete mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.make
 delete mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/depend.make
 delete mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/flags.make
 delete mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/link.txt
 delete mode 100644 examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/progress.make
 delete mode 100644 examples/sycl/quantize-stats/Makefile
 delete mode 100644 examples/sycl/quantize-stats/cmake_install.cmake
 delete mode 100644 examples/sycl/quantize/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/quantize/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/build.make
 delete mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.make
 delete mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/depend.make
 delete mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/flags.make
 delete mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/link.txt
 delete mode 100644 examples/sycl/quantize/CMakeFiles/quantize.dir/progress.make
 delete mode 100644 examples/sycl/quantize/Makefile
 delete mode 100644 examples/sycl/quantize/cmake_install.cmake
 delete mode 100644 examples/sycl/save-load-state/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/save-load-state/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/build.make
 delete mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.make
 delete mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/depend.make
 delete mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/flags.make
 delete mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/link.txt
 delete mode 100644 examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/progress.make
 delete mode 100644 examples/sycl/save-load-state/Makefile
 delete mode 100644 examples/sycl/save-load-state/cmake_install.cmake
 delete mode 100644 examples/sycl/simple/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/simple/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/build.make
 delete mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.make
 delete mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/depend.make
 delete mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/flags.make
 delete mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/link.txt
 delete mode 100644 examples/sycl/simple/CMakeFiles/simple.dir/progress.make
 delete mode 100644 examples/sycl/simple/Makefile
 delete mode 100644 examples/sycl/simple/cmake_install.cmake
 delete mode 100644 examples/sycl/speculative/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/speculative/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/build.make
 delete mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.make
 delete mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/depend.make
 delete mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/flags.make
 delete mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/link.txt
 delete mode 100644 examples/sycl/speculative/CMakeFiles/speculative.dir/progress.make
 delete mode 100644 examples/sycl/speculative/Makefile
 delete mode 100644 examples/sycl/speculative/cmake_install.cmake
 delete mode 100644 examples/sycl/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/build.make
 delete mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.make
 delete mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/depend.make
 delete mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/flags.make
 delete mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/link.txt
 delete mode 100644 examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/progress.make
 delete mode 100644 examples/sycl/sycl/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/sycl/Makefile
 delete mode 100644 examples/sycl/sycl/cmake_install.cmake
 delete mode 100644 examples/sycl/tokenize/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/tokenize/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/build.make
 delete mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.make
 delete mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/depend.make
 delete mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/flags.make
 delete mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/link.txt
 delete mode 100644 examples/sycl/tokenize/CMakeFiles/tokenize.dir/progress.make
 delete mode 100644 examples/sycl/tokenize/Makefile
 delete mode 100644 examples/sycl/tokenize/cmake_install.cmake
 delete mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/CMakeDirectoryInformation.cmake
 delete mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/progress.marks
 delete mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake
 delete mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make
 delete mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/cmake_clean.cmake
 delete mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.make
 delete mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.ts
 delete mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend.make
 delete mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make
 delete mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/link.txt
 delete mode 100644 examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/progress.make
 delete mode 100644 examples/sycl/train-text-from-scratch/Makefile
 delete mode 100644 examples/sycl/train-text-from-scratch/cmake_install.cmake

diff --git a/examples/sycl/CMakeCache.txt b/examples/sycl/CMakeCache.txt
deleted file mode 100644
index c8998b4ac8f15..0000000000000
--- a/examples/sycl/CMakeCache.txt
+++ /dev/null
@@ -1,369 +0,0 @@
-# This is the CMakeCache file.
-# For build in directory: /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-# It was generated by CMake: /usr/bin/cmake
-# You can edit this file to change values found and used by cmake.
-# If you do not want to change any of the values, simply exit the editor.
-# If you do want to change a value, simply edit, save, and exit the editor.
-# The syntax for the file is as follows:
-# KEY:TYPE=VALUE
-# KEY is the name of a variable in the cache.
-# TYPE is a hint to GUIs for the type of VALUE, DO NOT EDIT TYPE!.
-# VALUE is the current value for the KEY.
-
-########################
-# EXTERNAL cache entries
-########################
-
-//Path to a program.
-CMAKE_ADDR2LINE:FILEPATH=/usr/bin/addr2line
-
-//Path to a program.
-CMAKE_AR:FILEPATH=/usr/bin/ar
-
-//For backwards compatibility, what version of CMake commands and
-// syntax should this version of CMake try to support.
-CMAKE_BACKWARDS_COMPATIBILITY:STRING=2.4
-
-//Choose the type of build, options are: None Debug Release RelWithDebInfo
-// MinSizeRel ...
-CMAKE_BUILD_TYPE:STRING=
-
-//Enable/Disable color output during build.
-CMAKE_COLOR_MAKEFILE:BOOL=ON
-
-//CXX compiler
-CMAKE_CXX_COMPILER:STRING=/opt/intel/oneapi/compiler/2024.0/bin/icpx
-
-//Flags used by the CXX compiler during all build types.
-CMAKE_CXX_FLAGS:STRING=
-
-//Flags used by the CXX compiler during DEBUG builds.
-CMAKE_CXX_FLAGS_DEBUG:STRING=-g
-
-//Flags used by the CXX compiler during MINSIZEREL builds.
-CMAKE_CXX_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
-
-//Flags used by the CXX compiler during RELEASE builds.
-CMAKE_CXX_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
-
-//Flags used by the CXX compiler during RELWITHDEBINFO builds.
-CMAKE_CXX_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
-
-//C compiler
-CMAKE_C_COMPILER:STRING=/opt/intel/oneapi/compiler/2024.0/bin/icx
-
-//Flags used by the C compiler during all build types.
-CMAKE_C_FLAGS:STRING=
-
-//Flags used by the C compiler during DEBUG builds.
-CMAKE_C_FLAGS_DEBUG:STRING=-g
-
-//Flags used by the C compiler during MINSIZEREL builds.
-CMAKE_C_FLAGS_MINSIZEREL:STRING=-Os -DNDEBUG
-
-//Flags used by the C compiler during RELEASE builds.
-CMAKE_C_FLAGS_RELEASE:STRING=-O3 -DNDEBUG
-
-//Flags used by the C compiler during RELWITHDEBINFO builds.
-CMAKE_C_FLAGS_RELWITHDEBINFO:STRING=-O2 -g -DNDEBUG
-
-//Path to a program.
-CMAKE_DLLTOOL:FILEPATH=CMAKE_DLLTOOL-NOTFOUND
-
-//Flags used by the linker during all build types.
-CMAKE_EXE_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during DEBUG builds.
-CMAKE_EXE_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during MINSIZEREL builds.
-CMAKE_EXE_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during RELEASE builds.
-CMAKE_EXE_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during RELWITHDEBINFO builds.
-CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Enable/Disable output of compile commands during generation.
-CMAKE_EXPORT_COMPILE_COMMANDS:BOOL=
-
-//Install path prefix, prepended onto install directories.
-CMAKE_INSTALL_PREFIX:PATH=/usr/local
-
-//Path to a program.
-CMAKE_LINKER:FILEPATH=/usr/bin/ld
-
-//Path to a program.
-CMAKE_MAKE_PROGRAM:FILEPATH=/usr/bin/gmake
-
-//Flags used by the linker during the creation of modules during
-// all build types.
-CMAKE_MODULE_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during the creation of modules during
-// DEBUG builds.
-CMAKE_MODULE_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during the creation of modules during
-// MINSIZEREL builds.
-CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during the creation of modules during
-// RELEASE builds.
-CMAKE_MODULE_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during the creation of modules during
-// RELWITHDEBINFO builds.
-CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Path to a program.
-CMAKE_NM:FILEPATH=/usr/bin/nm
-
-//Path to a program.
-CMAKE_OBJCOPY:FILEPATH=/usr/bin/objcopy
-
-//Path to a program.
-CMAKE_OBJDUMP:FILEPATH=/usr/bin/objdump
-
-//Value Computed by CMake
-CMAKE_PROJECT_DESCRIPTION:STATIC=
-
-//Value Computed by CMake
-CMAKE_PROJECT_HOMEPAGE_URL:STATIC=
-
-//Value Computed by CMake
-CMAKE_PROJECT_NAME:STATIC=Project
-
-//Path to a program.
-CMAKE_RANLIB:FILEPATH=/usr/bin/ranlib
-
-//Path to a program.
-CMAKE_READELF:FILEPATH=/usr/bin/readelf
-
-//Flags used by the linker during the creation of shared libraries
-// during all build types.
-CMAKE_SHARED_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during DEBUG builds.
-CMAKE_SHARED_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during MINSIZEREL builds.
-CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during RELEASE builds.
-CMAKE_SHARED_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during the creation of shared libraries
-// during RELWITHDEBINFO builds.
-CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//If set, runtime paths are not added when installing shared libraries,
-// but are added when building.
-CMAKE_SKIP_INSTALL_RPATH:BOOL=NO
-
-//If set, runtime paths are not added when using shared libraries.
-CMAKE_SKIP_RPATH:BOOL=NO
-
-//Flags used by the linker during the creation of static libraries
-// during all build types.
-CMAKE_STATIC_LINKER_FLAGS:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during DEBUG builds.
-CMAKE_STATIC_LINKER_FLAGS_DEBUG:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during MINSIZEREL builds.
-CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during RELEASE builds.
-CMAKE_STATIC_LINKER_FLAGS_RELEASE:STRING=
-
-//Flags used by the linker during the creation of static libraries
-// during RELWITHDEBINFO builds.
-CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO:STRING=
-
-//Path to a program.
-CMAKE_STRIP:FILEPATH=/usr/bin/strip
-
-//If this value is on, makefiles will be generated without the
-// .SILENT directive, and all commands will be echoed to the console
-// during the make.  This is useful for debugging only. With Visual
-// Studio IDE projects all commands are done without /nologo.
-CMAKE_VERBOSE_MAKEFILE:BOOL=FALSE
-
-//Single output directory for building all executables.
-EXECUTABLE_OUTPUT_PATH:PATH=
-
-//Single output directory for building all libraries.
-LIBRARY_OUTPUT_PATH:PATH=
-
-//No help, variable specified on the command line.
-LLAMA_SYCL:UNINITIALIZED=ON
-
-//Value Computed by CMake
-Project_BINARY_DIR:STATIC=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-//Value Computed by CMake
-Project_IS_TOP_LEVEL:STATIC=ON
-
-//Value Computed by CMake
-Project_SOURCE_DIR:STATIC=/home/jianyuzh/ws/llama.cpp/develop/examples
-
-//Path to a program.
-XIAR:FILEPATH=/opt/intel/oneapi/compiler/2024.0/bin/xiar
-
-
-########################
-# INTERNAL cache entries
-########################
-
-//ADVANCED property for variable: CMAKE_ADDR2LINE
-CMAKE_ADDR2LINE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_AR
-CMAKE_AR-ADVANCED:INTERNAL=1
-//This is the directory where this CMakeCache.txt was created
-CMAKE_CACHEFILE_DIR:INTERNAL=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-//Major version of cmake used to create the current loaded cache
-CMAKE_CACHE_MAJOR_VERSION:INTERNAL=3
-//Minor version of cmake used to create the current loaded cache
-CMAKE_CACHE_MINOR_VERSION:INTERNAL=22
-//Patch version of cmake used to create the current loaded cache
-CMAKE_CACHE_PATCH_VERSION:INTERNAL=1
-//ADVANCED property for variable: CMAKE_COLOR_MAKEFILE
-CMAKE_COLOR_MAKEFILE-ADVANCED:INTERNAL=1
-//Path to CMake executable.
-CMAKE_COMMAND:INTERNAL=/usr/bin/cmake
-//Path to cpack program executable.
-CMAKE_CPACK_COMMAND:INTERNAL=/usr/bin/cpack
-//Path to ctest program executable.
-CMAKE_CTEST_COMMAND:INTERNAL=/usr/bin/ctest
-//ADVANCED property for variable: CMAKE_CXX_COMPILER
-CMAKE_CXX_COMPILER-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS
-CMAKE_CXX_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_DEBUG
-CMAKE_CXX_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_MINSIZEREL
-CMAKE_CXX_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELEASE
-CMAKE_CXX_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_CXX_FLAGS_RELWITHDEBINFO
-CMAKE_CXX_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_COMPILER
-CMAKE_C_COMPILER-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_FLAGS
-CMAKE_C_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_FLAGS_DEBUG
-CMAKE_C_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_FLAGS_MINSIZEREL
-CMAKE_C_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_FLAGS_RELEASE
-CMAKE_C_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_C_FLAGS_RELWITHDEBINFO
-CMAKE_C_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_DLLTOOL
-CMAKE_DLLTOOL-ADVANCED:INTERNAL=1
-//Executable file format
-CMAKE_EXECUTABLE_FORMAT:INTERNAL=ELF
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS
-CMAKE_EXE_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_DEBUG
-CMAKE_EXE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_MINSIZEREL
-CMAKE_EXE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELEASE
-CMAKE_EXE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_EXPORT_COMPILE_COMMANDS
-CMAKE_EXPORT_COMPILE_COMMANDS-ADVANCED:INTERNAL=1
-//Name of external makefile project generator.
-CMAKE_EXTRA_GENERATOR:INTERNAL=
-//Name of generator.
-CMAKE_GENERATOR:INTERNAL=Unix Makefiles
-//Generator instance identifier.
-CMAKE_GENERATOR_INSTANCE:INTERNAL=
-//Name of generator platform.
-CMAKE_GENERATOR_PLATFORM:INTERNAL=
-//Name of generator toolset.
-CMAKE_GENERATOR_TOOLSET:INTERNAL=
-//Test CMAKE_HAVE_LIBC_PTHREAD
-CMAKE_HAVE_LIBC_PTHREAD:INTERNAL=1
-//Have include pthread.h
-CMAKE_HAVE_PTHREAD_H:INTERNAL=1
-//Source directory with the top level CMakeLists.txt file for this
-// project
-CMAKE_HOME_DIRECTORY:INTERNAL=/home/jianyuzh/ws/llama.cpp/develop/examples
-//Install .so files without execute permission.
-CMAKE_INSTALL_SO_NO_EXE:INTERNAL=1
-//ADVANCED property for variable: CMAKE_LINKER
-CMAKE_LINKER-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MAKE_PROGRAM
-CMAKE_MAKE_PROGRAM-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS
-CMAKE_MODULE_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_DEBUG
-CMAKE_MODULE_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL
-CMAKE_MODULE_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELEASE
-CMAKE_MODULE_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_MODULE_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_NM
-CMAKE_NM-ADVANCED:INTERNAL=1
-//number of local generators
-CMAKE_NUMBER_OF_MAKEFILES:INTERNAL=26
-//ADVANCED property for variable: CMAKE_OBJCOPY
-CMAKE_OBJCOPY-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_OBJDUMP
-CMAKE_OBJDUMP-ADVANCED:INTERNAL=1
-//Platform information initialized
-CMAKE_PLATFORM_INFO_INITIALIZED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_RANLIB
-CMAKE_RANLIB-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_READELF
-CMAKE_READELF-ADVANCED:INTERNAL=1
-//Path to CMake installation.
-CMAKE_ROOT:INTERNAL=/usr/share/cmake-3.22
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS
-CMAKE_SHARED_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_DEBUG
-CMAKE_SHARED_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL
-CMAKE_SHARED_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELEASE
-CMAKE_SHARED_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SKIP_INSTALL_RPATH
-CMAKE_SKIP_INSTALL_RPATH-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_SKIP_RPATH
-CMAKE_SKIP_RPATH-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS
-CMAKE_STATIC_LINKER_FLAGS-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_DEBUG
-CMAKE_STATIC_LINKER_FLAGS_DEBUG-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL
-CMAKE_STATIC_LINKER_FLAGS_MINSIZEREL-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELEASE
-CMAKE_STATIC_LINKER_FLAGS_RELEASE-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO
-CMAKE_STATIC_LINKER_FLAGS_RELWITHDEBINFO-ADVANCED:INTERNAL=1
-//ADVANCED property for variable: CMAKE_STRIP
-CMAKE_STRIP-ADVANCED:INTERNAL=1
-//uname command
-CMAKE_UNAME:INTERNAL=/usr/bin/uname
-//ADVANCED property for variable: CMAKE_VERBOSE_MAKEFILE
-CMAKE_VERBOSE_MAKEFILE-ADVANCED:INTERNAL=1
-//Details about finding Threads
-FIND_PACKAGE_MESSAGE_DETAILS_Threads:INTERNAL=[TRUE][v()]
-//ADVANCED property for variable: XIAR
-XIAR-ADVANCED:INTERNAL=1
-
diff --git a/examples/sycl/CMakeFiles/3.22.1/CMakeCCompiler.cmake b/examples/sycl/CMakeFiles/3.22.1/CMakeCCompiler.cmake
deleted file mode 100644
index 0e4cc221817c5..0000000000000
--- a/examples/sycl/CMakeFiles/3.22.1/CMakeCCompiler.cmake
+++ /dev/null
@@ -1,72 +0,0 @@
-set(CMAKE_C_COMPILER "/opt/intel/oneapi/compiler/2024.0/bin/icx")
-set(CMAKE_C_COMPILER_ARG1 "")
-set(CMAKE_C_COMPILER_ID "IntelLLVM")
-set(CMAKE_C_COMPILER_VERSION "2024.0.0")
-set(CMAKE_C_COMPILER_VERSION_INTERNAL "")
-set(CMAKE_C_COMPILER_WRAPPER "")
-set(CMAKE_C_STANDARD_COMPUTED_DEFAULT "17")
-set(CMAKE_C_EXTENSIONS_COMPUTED_DEFAULT "ON")
-set(CMAKE_C_COMPILE_FEATURES "c_std_90;c_function_prototypes;c_std_99;c_restrict;c_variadic_macros;c_std_11;c_static_assert;c_std_17")
-set(CMAKE_C90_COMPILE_FEATURES "c_std_90;c_function_prototypes")
-set(CMAKE_C99_COMPILE_FEATURES "c_std_99;c_restrict;c_variadic_macros")
-set(CMAKE_C11_COMPILE_FEATURES "c_std_11;c_static_assert")
-set(CMAKE_C17_COMPILE_FEATURES "c_std_17")
-set(CMAKE_C23_COMPILE_FEATURES "")
-
-set(CMAKE_C_PLATFORM_ID "Linux")
-set(CMAKE_C_SIMULATE_ID "GNU")
-set(CMAKE_C_COMPILER_FRONTEND_VARIANT "GNU")
-set(CMAKE_C_SIMULATE_VERSION "4.2.1")
-
-
-
-
-set(CMAKE_AR "/usr/bin/ar")
-set(CMAKE_C_COMPILER_AR "")
-set(CMAKE_RANLIB "/usr/bin/ranlib")
-set(CMAKE_C_COMPILER_RANLIB "")
-set(CMAKE_LINKER "/usr/bin/ld")
-set(CMAKE_MT "")
-set(CMAKE_COMPILER_IS_GNUCC )
-set(CMAKE_C_COMPILER_LOADED 1)
-set(CMAKE_C_COMPILER_WORKS TRUE)
-set(CMAKE_C_ABI_COMPILED TRUE)
-
-set(CMAKE_C_COMPILER_ENV_VAR "CC")
-
-set(CMAKE_C_COMPILER_ID_RUN 1)
-set(CMAKE_C_SOURCE_FILE_EXTENSIONS c;m)
-set(CMAKE_C_IGNORE_EXTENSIONS h;H;o;O;obj;OBJ;def;DEF;rc;RC)
-set(CMAKE_C_LINKER_PREFERENCE 10)
-
-# Save compiler ABI information.
-set(CMAKE_C_SIZEOF_DATA_PTR "8")
-set(CMAKE_C_COMPILER_ABI "ELF")
-set(CMAKE_C_BYTE_ORDER "LITTLE_ENDIAN")
-set(CMAKE_C_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-
-if(CMAKE_C_SIZEOF_DATA_PTR)
-  set(CMAKE_SIZEOF_VOID_P "${CMAKE_C_SIZEOF_DATA_PTR}")
-endif()
-
-if(CMAKE_C_COMPILER_ABI)
-  set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_C_COMPILER_ABI}")
-endif()
-
-if(CMAKE_C_LIBRARY_ARCHITECTURE)
-  set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-endif()
-
-set(CMAKE_C_CL_SHOWINCLUDES_PREFIX "")
-if(CMAKE_C_CL_SHOWINCLUDES_PREFIX)
-  set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_C_CL_SHOWINCLUDES_PREFIX}")
-endif()
-
-
-
-
-
-set(CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES "/opt/intel/oneapi/tbb/2021.11/include;/opt/intel/oneapi/mpi/2021.11/include;/opt/intel/oneapi/mkl/2024.0/include;/opt/intel/oneapi/ippcp/2021.9/include;/opt/intel/oneapi/ipp/2021.10/include;/opt/intel/oneapi/dpl/2022.3/include;/opt/intel/oneapi/dpcpp-ct/2024.0/include;/opt/intel/oneapi/dnnl/2024.0/include;/opt/intel/oneapi/dev-utilities/2024.0/include;/opt/intel/oneapi/dal/2024.0/include/dal;/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/include;/opt/intel/oneapi/ccl/2021.11/include;/opt/intel/oneapi/compiler/2024.0/opt/compiler/include;/opt/intel/oneapi/compiler/2024.0/lib/clang/17/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include")
-set(CMAKE_C_IMPLICIT_LINK_LIBRARIES "svml;irng;imf;m;gcc;gcc_s;irc;dl;gcc;gcc_s;c;gcc;gcc_s;irc_s")
-set(CMAKE_C_IMPLICIT_LINK_DIRECTORIES "/opt/intel/oneapi/compiler/2024.0/lib;/opt/intel/oneapi/compiler/2024.0/lib/clang/17/lib/x86_64-unknown-linux-gnu;/usr/lib/gcc/x86_64-linux-gnu/11;/usr/lib64;/lib/x86_64-linux-gnu;/lib64;/usr/lib/x86_64-linux-gnu;/usr/lib;/opt/intel/oneapi/compiler/2024.0/opt/compiler/lib;/lib;/opt/intel/oneapi/tbb/2021.11/lib/intel64/gcc4.8;/opt/intel/oneapi/mpi/2021.11/lib;/opt/intel/oneapi/mkl/2024.0/lib;/opt/intel/oneapi/ippcp/2021.9/lib;/opt/intel/oneapi/ipp/2021.10/lib;/opt/intel/oneapi/dpl/2022.3/lib;/opt/intel/oneapi/dnnl/2024.0/lib;/opt/intel/oneapi/dal/2024.0/lib;/opt/intel/oneapi/ccl/2021.11/lib")
-set(CMAKE_C_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
diff --git a/examples/sycl/CMakeFiles/3.22.1/CMakeCXXCompiler.cmake b/examples/sycl/CMakeFiles/3.22.1/CMakeCXXCompiler.cmake
deleted file mode 100644
index 37999919458bd..0000000000000
--- a/examples/sycl/CMakeFiles/3.22.1/CMakeCXXCompiler.cmake
+++ /dev/null
@@ -1,83 +0,0 @@
-set(CMAKE_CXX_COMPILER "/opt/intel/oneapi/compiler/2024.0/bin/icpx")
-set(CMAKE_CXX_COMPILER_ARG1 "")
-set(CMAKE_CXX_COMPILER_ID "IntelLLVM")
-set(CMAKE_CXX_COMPILER_VERSION "2024.0.0")
-set(CMAKE_CXX_COMPILER_VERSION_INTERNAL "")
-set(CMAKE_CXX_COMPILER_WRAPPER "")
-set(CMAKE_CXX_STANDARD_COMPUTED_DEFAULT "17")
-set(CMAKE_CXX_EXTENSIONS_COMPUTED_DEFAULT "ON")
-set(CMAKE_CXX_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters;cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates;cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates;cxx_std_17;cxx_std_20")
-set(CMAKE_CXX98_COMPILE_FEATURES "cxx_std_98;cxx_template_template_parameters")
-set(CMAKE_CXX11_COMPILE_FEATURES "cxx_std_11;cxx_alias_templates;cxx_alignas;cxx_alignof;cxx_attributes;cxx_auto_type;cxx_constexpr;cxx_decltype;cxx_decltype_incomplete_return_types;cxx_default_function_template_args;cxx_defaulted_functions;cxx_defaulted_move_initializers;cxx_delegating_constructors;cxx_deleted_functions;cxx_enum_forward_declarations;cxx_explicit_conversions;cxx_extended_friend_declarations;cxx_extern_templates;cxx_final;cxx_func_identifier;cxx_generalized_initializers;cxx_inheriting_constructors;cxx_inline_namespaces;cxx_lambdas;cxx_local_type_template_args;cxx_long_long_type;cxx_noexcept;cxx_nonstatic_member_init;cxx_nullptr;cxx_override;cxx_range_for;cxx_raw_string_literals;cxx_reference_qualified_functions;cxx_right_angle_brackets;cxx_rvalue_references;cxx_sizeof_member;cxx_static_assert;cxx_strong_enums;cxx_thread_local;cxx_trailing_return_types;cxx_unicode_literals;cxx_uniform_initialization;cxx_unrestricted_unions;cxx_user_literals;cxx_variadic_macros;cxx_variadic_templates")
-set(CMAKE_CXX14_COMPILE_FEATURES "cxx_std_14;cxx_aggregate_default_initializers;cxx_attribute_deprecated;cxx_binary_literals;cxx_contextual_conversions;cxx_decltype_auto;cxx_digit_separators;cxx_generic_lambdas;cxx_lambda_init_captures;cxx_relaxed_constexpr;cxx_return_type_deduction;cxx_variable_templates")
-set(CMAKE_CXX17_COMPILE_FEATURES "cxx_std_17")
-set(CMAKE_CXX20_COMPILE_FEATURES "cxx_std_20")
-set(CMAKE_CXX23_COMPILE_FEATURES "")
-
-set(CMAKE_CXX_PLATFORM_ID "Linux")
-set(CMAKE_CXX_SIMULATE_ID "GNU")
-set(CMAKE_CXX_COMPILER_FRONTEND_VARIANT "GNU")
-set(CMAKE_CXX_SIMULATE_VERSION "4.2.1")
-
-
-
-
-set(CMAKE_AR "/usr/bin/ar")
-set(CMAKE_CXX_COMPILER_AR "")
-set(CMAKE_RANLIB "/usr/bin/ranlib")
-set(CMAKE_CXX_COMPILER_RANLIB "")
-set(CMAKE_LINKER "/usr/bin/ld")
-set(CMAKE_MT "")
-set(CMAKE_COMPILER_IS_GNUCXX )
-set(CMAKE_CXX_COMPILER_LOADED 1)
-set(CMAKE_CXX_COMPILER_WORKS TRUE)
-set(CMAKE_CXX_ABI_COMPILED TRUE)
-
-set(CMAKE_CXX_COMPILER_ENV_VAR "CXX")
-
-set(CMAKE_CXX_COMPILER_ID_RUN 1)
-set(CMAKE_CXX_SOURCE_FILE_EXTENSIONS C;M;c++;cc;cpp;cxx;m;mm;mpp;CPP;ixx;cppm)
-set(CMAKE_CXX_IGNORE_EXTENSIONS inl;h;hpp;HPP;H;o;O;obj;OBJ;def;DEF;rc;RC)
-
-foreach (lang C OBJC OBJCXX)
-  if (CMAKE_${lang}_COMPILER_ID_RUN)
-    foreach(extension IN LISTS CMAKE_${lang}_SOURCE_FILE_EXTENSIONS)
-      list(REMOVE_ITEM CMAKE_CXX_SOURCE_FILE_EXTENSIONS ${extension})
-    endforeach()
-  endif()
-endforeach()
-
-set(CMAKE_CXX_LINKER_PREFERENCE 30)
-set(CMAKE_CXX_LINKER_PREFERENCE_PROPAGATES 1)
-
-# Save compiler ABI information.
-set(CMAKE_CXX_SIZEOF_DATA_PTR "8")
-set(CMAKE_CXX_COMPILER_ABI "ELF")
-set(CMAKE_CXX_BYTE_ORDER "LITTLE_ENDIAN")
-set(CMAKE_CXX_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-
-if(CMAKE_CXX_SIZEOF_DATA_PTR)
-  set(CMAKE_SIZEOF_VOID_P "${CMAKE_CXX_SIZEOF_DATA_PTR}")
-endif()
-
-if(CMAKE_CXX_COMPILER_ABI)
-  set(CMAKE_INTERNAL_PLATFORM_ABI "${CMAKE_CXX_COMPILER_ABI}")
-endif()
-
-if(CMAKE_CXX_LIBRARY_ARCHITECTURE)
-  set(CMAKE_LIBRARY_ARCHITECTURE "x86_64-linux-gnu")
-endif()
-
-set(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX "")
-if(CMAKE_CXX_CL_SHOWINCLUDES_PREFIX)
-  set(CMAKE_CL_SHOWINCLUDES_PREFIX "${CMAKE_CXX_CL_SHOWINCLUDES_PREFIX}")
-endif()
-
-
-
-
-
-set(CMAKE_CXX_IMPLICIT_INCLUDE_DIRECTORIES "/opt/intel/oneapi/tbb/2021.11/include;/opt/intel/oneapi/mpi/2021.11/include;/opt/intel/oneapi/mkl/2024.0/include;/opt/intel/oneapi/ippcp/2021.9/include;/opt/intel/oneapi/ipp/2021.10/include;/opt/intel/oneapi/dpl/2022.3/include;/opt/intel/oneapi/dpcpp-ct/2024.0/include;/opt/intel/oneapi/dnnl/2024.0/include;/opt/intel/oneapi/dev-utilities/2024.0/include;/opt/intel/oneapi/dal/2024.0/include/dal;/opt/intel/oneapi/compiler/2024.0/opt/oclfpga/include;/opt/intel/oneapi/ccl/2021.11/include;/opt/intel/oneapi/compiler/2024.0/opt/compiler/include;/usr/include/c++/11;/usr/include/x86_64-linux-gnu/c++/11;/usr/include/c++/11/backward;/opt/intel/oneapi/compiler/2024.0/lib/clang/17/include;/usr/local/include;/usr/include/x86_64-linux-gnu;/usr/include")
-set(CMAKE_CXX_IMPLICIT_LINK_LIBRARIES "svml;irng;stdc++;imf;m;gcc_s;gcc;irc;dl;gcc_s;gcc;c;gcc_s;gcc;irc_s")
-set(CMAKE_CXX_IMPLICIT_LINK_DIRECTORIES "/opt/intel/oneapi/compiler/2024.0/lib;/opt/intel/oneapi/compiler/2024.0/lib/clang/17/lib/x86_64-unknown-linux-gnu;/usr/lib/gcc/x86_64-linux-gnu/11;/usr/lib64;/lib/x86_64-linux-gnu;/lib64;/usr/lib/x86_64-linux-gnu;/usr/lib;/opt/intel/oneapi/compiler/2024.0/opt/compiler/lib;/lib;/opt/intel/oneapi/tbb/2021.11/lib/intel64/gcc4.8;/opt/intel/oneapi/mpi/2021.11/lib;/opt/intel/oneapi/mkl/2024.0/lib;/opt/intel/oneapi/ippcp/2021.9/lib;/opt/intel/oneapi/ipp/2021.10/lib;/opt/intel/oneapi/dpl/2022.3/lib;/opt/intel/oneapi/dnnl/2024.0/lib;/opt/intel/oneapi/dal/2024.0/lib;/opt/intel/oneapi/ccl/2021.11/lib")
-set(CMAKE_CXX_IMPLICIT_LINK_FRAMEWORK_DIRECTORIES "")
diff --git a/examples/sycl/CMakeFiles/3.22.1/CMakeSystem.cmake b/examples/sycl/CMakeFiles/3.22.1/CMakeSystem.cmake
deleted file mode 100644
index 21d50c58e038f..0000000000000
--- a/examples/sycl/CMakeFiles/3.22.1/CMakeSystem.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-set(CMAKE_HOST_SYSTEM "Linux-6.2.0-39-generic")
-set(CMAKE_HOST_SYSTEM_NAME "Linux")
-set(CMAKE_HOST_SYSTEM_VERSION "6.2.0-39-generic")
-set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64")
-
-
-
-set(CMAKE_SYSTEM "Linux-6.2.0-39-generic")
-set(CMAKE_SYSTEM_NAME "Linux")
-set(CMAKE_SYSTEM_VERSION "6.2.0-39-generic")
-set(CMAKE_SYSTEM_PROCESSOR "x86_64")
-
-set(CMAKE_CROSSCOMPILING "FALSE")
-
-set(CMAKE_SYSTEM_LOADED 1)
diff --git a/examples/sycl/CMakeFiles/3.22.1/CompilerIdC/CMakeCCompilerId.c b/examples/sycl/CMakeFiles/3.22.1/CompilerIdC/CMakeCCompilerId.c
deleted file mode 100644
index 41b99d7783c1e..0000000000000
--- a/examples/sycl/CMakeFiles/3.22.1/CompilerIdC/CMakeCCompilerId.c
+++ /dev/null
@@ -1,803 +0,0 @@
-#ifdef __cplusplus
-# error "A C++ compiler has been selected for C."
-#endif
-
-#if defined(__18CXX)
-# define ID_VOID_MAIN
-#endif
-#if defined(__CLASSIC_C__)
-/* cv-qualifiers did not exist in K&R C */
-# define const
-# define volatile
-#endif
-
-#if !defined(__has_include)
-/* If the compiler does not have __has_include, pretend the answer is
-   always no.  */
-#  define __has_include(x) 0
-#endif
-
-
-/* Version number components: V=Version, R=Revision, P=Patch
-   Version date components:   YYYY=Year, MM=Month,   DD=Day  */
-
-#if defined(__INTEL_COMPILER) || defined(__ICC)
-# define COMPILER_ID "Intel"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# if defined(__GNUC__)
-#  define SIMULATE_ID "GNU"
-# endif
-  /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
-     except that a few beta releases use the old format with V=2021.  */
-# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
-#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
-#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
-#  if defined(__INTEL_COMPILER_UPDATE)
-#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
-#  else
-#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER   % 10)
-#  endif
-# else
-#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
-#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
-   /* The third version component from --version is an update index,
-      but no macro is provided for it.  */
-#  define COMPILER_VERSION_PATCH DEC(0)
-# endif
-# if defined(__INTEL_COMPILER_BUILD_DATE)
-   /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
-#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
-# endif
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-# if defined(__GNUC__)
-#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
-# elif defined(__GNUG__)
-#  define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
-# endif
-# if defined(__GNUC_MINOR__)
-#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
-# endif
-# if defined(__GNUC_PATCHLEVEL__)
-#  define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-# endif
-
-#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
-# define COMPILER_ID "IntelLLVM"
-#if defined(_MSC_VER)
-# define SIMULATE_ID "MSVC"
-#endif
-#if defined(__GNUC__)
-# define SIMULATE_ID "GNU"
-#endif
-/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
- * later.  Look for 6 digit vs. 8 digit version number to decide encoding.
- * VVVV is no smaller than the current year when a version is released.
- */
-#if __INTEL_LLVM_COMPILER < 1000000L
-# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
-# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER    % 10)
-#else
-# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
-# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
-# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER     % 100)
-#endif
-#if defined(_MSC_VER)
-  /* _MSC_VER = VVRR */
-# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-#endif
-#if defined(__GNUC__)
-# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
-#elif defined(__GNUG__)
-# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
-#endif
-#if defined(__GNUC_MINOR__)
-# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
-#endif
-#if defined(__GNUC_PATCHLEVEL__)
-# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-#endif
-
-#elif defined(__PATHCC__)
-# define COMPILER_ID "PathScale"
-# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
-# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
-# if defined(__PATHCC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
-# endif
-
-#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
-# define COMPILER_ID "Embarcadero"
-# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
-# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
-# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__     & 0xFFFF)
-
-#elif defined(__BORLANDC__)
-# define COMPILER_ID "Borland"
-  /* __BORLANDC__ = 0xVRR */
-# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
-# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
-
-#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
-# define COMPILER_ID "Watcom"
-   /* __WATCOMC__ = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__WATCOMC__)
-# define COMPILER_ID "OpenWatcom"
-   /* __WATCOMC__ = VVRP + 1100 */
-# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__SUNPRO_C)
-# define COMPILER_ID "SunPro"
-# if __SUNPRO_C >= 0x5100
-   /* __SUNPRO_C = 0xVRRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>12)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xFF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_C    & 0xF)
-# else
-   /* __SUNPRO_CC = 0xVRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_C>>8)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_C>>4 & 0xF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_C    & 0xF)
-# endif
-
-#elif defined(__HP_cc)
-# define COMPILER_ID "HP"
-  /* __HP_cc = VVRRPP */
-# define COMPILER_VERSION_MAJOR DEC(__HP_cc/10000)
-# define COMPILER_VERSION_MINOR DEC(__HP_cc/100 % 100)
-# define COMPILER_VERSION_PATCH DEC(__HP_cc     % 100)
-
-#elif defined(__DECC)
-# define COMPILER_ID "Compaq"
-  /* __DECC_VER = VVRRTPPPP */
-# define COMPILER_VERSION_MAJOR DEC(__DECC_VER/10000000)
-# define COMPILER_VERSION_MINOR DEC(__DECC_VER/100000  % 100)
-# define COMPILER_VERSION_PATCH DEC(__DECC_VER         % 10000)
-
-#elif defined(__IBMC__) && defined(__COMPILER_VER__)
-# define COMPILER_ID "zOS"
-  /* __IBMC__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMC__    % 10)
-
-#elif defined(__ibmxl__) && defined(__clang__)
-# define COMPILER_ID "XLClang"
-# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
-# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
-# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
-# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
-
-
-#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ >= 800
-# define COMPILER_ID "XL"
-  /* __IBMC__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMC__    % 10)
-
-#elif defined(__IBMC__) && !defined(__COMPILER_VER__) && __IBMC__ < 800
-# define COMPILER_ID "VisualAge"
-  /* __IBMC__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMC__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMC__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMC__    % 10)
-
-#elif defined(__NVCOMPILER)
-# define COMPILER_ID "NVHPC"
-# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
-# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
-# if defined(__NVCOMPILER_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
-# endif
-
-#elif defined(__PGI)
-# define COMPILER_ID "PGI"
-# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
-# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
-# if defined(__PGIC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
-# endif
-
-#elif defined(_CRAYC)
-# define COMPILER_ID "Cray"
-# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
-# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
-
-#elif defined(__TI_COMPILER_VERSION__)
-# define COMPILER_ID "TI"
-  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
-# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
-# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000   % 1000)
-# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__        % 1000)
-
-#elif defined(__CLANG_FUJITSU)
-# define COMPILER_ID "FujitsuClang"
-# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
-# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
-# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
-# define COMPILER_VERSION_INTERNAL_STR __clang_version__
-
-
-#elif defined(__FUJITSU)
-# define COMPILER_ID "Fujitsu"
-# if defined(__FCC_version__)
-#   define COMPILER_VERSION __FCC_version__
-# elif defined(__FCC_major__)
-#   define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
-#   define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
-#   define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
-# endif
-# if defined(__fcc_version)
-#   define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
-# elif defined(__FCC_VERSION)
-#   define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
-# endif
-
-
-#elif defined(__ghs__)
-# define COMPILER_ID "GHS"
-/* __GHS_VERSION_NUMBER = VVVVRP */
-# ifdef __GHS_VERSION_NUMBER
-# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
-# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER      % 10)
-# endif
-
-#elif defined(__TINYC__)
-# define COMPILER_ID "TinyCC"
-
-#elif defined(__BCC__)
-# define COMPILER_ID "Bruce"
-
-#elif defined(__SCO_VERSION__)
-# define COMPILER_ID "SCO"
-
-#elif defined(__ARMCC_VERSION) && !defined(__clang__)
-# define COMPILER_ID "ARMCC"
-#if __ARMCC_VERSION >= 1000000
-  /* __ARMCC_VERSION = VRRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION     % 10000)
-#else
-  /* __ARMCC_VERSION = VRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION    % 10000)
-#endif
-
-
-#elif defined(__clang__) && defined(__apple_build_version__)
-# define COMPILER_ID "AppleClang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
-
-#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
-# define COMPILER_ID "ARMClang"
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION     % 10000)
-# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
-
-#elif defined(__clang__)
-# define COMPILER_ID "Clang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-
-#elif defined(__GNUC__)
-# define COMPILER_ID "GNU"
-# define COMPILER_VERSION_MAJOR DEC(__GNUC__)
-# if defined(__GNUC_MINOR__)
-#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
-# endif
-# if defined(__GNUC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-# endif
-
-#elif defined(_MSC_VER)
-# define COMPILER_ID "MSVC"
-  /* _MSC_VER = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
-# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
-# if defined(_MSC_FULL_VER)
-#  if _MSC_VER >= 1400
-    /* _MSC_FULL_VER = VVRRPPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
-#  else
-    /* _MSC_FULL_VER = VVRRPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
-#  endif
-# endif
-# if defined(_MSC_BUILD)
-#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
-# endif
-
-#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__)
-# define COMPILER_ID "ADSP"
-#if defined(__VISUALDSPVERSION__)
-  /* __VISUALDSPVERSION__ = 0xVVRRPP00 */
-# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24)
-# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF)
-# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8  & 0xFF)
-#endif
-
-#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
-# define COMPILER_ID "IAR"
-# if defined(__VER__) && defined(__ICCARM__)
-#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
-#  define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
-#  define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
-#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
-# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
-#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
-#  define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
-#  define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
-#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
-# endif
-
-#elif defined(__SDCC_VERSION_MAJOR) || defined(SDCC)
-# define COMPILER_ID "SDCC"
-# if defined(__SDCC_VERSION_MAJOR)
-#  define COMPILER_VERSION_MAJOR DEC(__SDCC_VERSION_MAJOR)
-#  define COMPILER_VERSION_MINOR DEC(__SDCC_VERSION_MINOR)
-#  define COMPILER_VERSION_PATCH DEC(__SDCC_VERSION_PATCH)
-# else
-  /* SDCC = VRP */
-#  define COMPILER_VERSION_MAJOR DEC(SDCC/100)
-#  define COMPILER_VERSION_MINOR DEC(SDCC/10 % 10)
-#  define COMPILER_VERSION_PATCH DEC(SDCC    % 10)
-# endif
-
-
-/* These compilers are either not known or too old to define an
-  identification macro.  Try to identify the platform and guess that
-  it is the native compiler.  */
-#elif defined(__hpux) || defined(__hpua)
-# define COMPILER_ID "HP"
-
-#else /* unknown compiler */
-# define COMPILER_ID ""
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
-#ifdef SIMULATE_ID
-char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
-#endif
-
-#ifdef __QNXNTO__
-char const* qnxnto = "INFO" ":" "qnxnto[]";
-#endif
-
-#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
-char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
-#endif
-
-#define STRINGIFY_HELPER(X) #X
-#define STRINGIFY(X) STRINGIFY_HELPER(X)
-
-/* Identify known platforms by name.  */
-#if defined(__linux) || defined(__linux__) || defined(linux)
-# define PLATFORM_ID "Linux"
-
-#elif defined(__MSYS__)
-# define PLATFORM_ID "MSYS"
-
-#elif defined(__CYGWIN__)
-# define PLATFORM_ID "Cygwin"
-
-#elif defined(__MINGW32__)
-# define PLATFORM_ID "MinGW"
-
-#elif defined(__APPLE__)
-# define PLATFORM_ID "Darwin"
-
-#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
-# define PLATFORM_ID "Windows"
-
-#elif defined(__FreeBSD__) || defined(__FreeBSD)
-# define PLATFORM_ID "FreeBSD"
-
-#elif defined(__NetBSD__) || defined(__NetBSD)
-# define PLATFORM_ID "NetBSD"
-
-#elif defined(__OpenBSD__) || defined(__OPENBSD)
-# define PLATFORM_ID "OpenBSD"
-
-#elif defined(__sun) || defined(sun)
-# define PLATFORM_ID "SunOS"
-
-#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
-# define PLATFORM_ID "AIX"
-
-#elif defined(__hpux) || defined(__hpux__)
-# define PLATFORM_ID "HP-UX"
-
-#elif defined(__HAIKU__)
-# define PLATFORM_ID "Haiku"
-
-#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
-# define PLATFORM_ID "BeOS"
-
-#elif defined(__QNX__) || defined(__QNXNTO__)
-# define PLATFORM_ID "QNX"
-
-#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
-# define PLATFORM_ID "Tru64"
-
-#elif defined(__riscos) || defined(__riscos__)
-# define PLATFORM_ID "RISCos"
-
-#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
-# define PLATFORM_ID "SINIX"
-
-#elif defined(__UNIX_SV__)
-# define PLATFORM_ID "UNIX_SV"
-
-#elif defined(__bsdos__)
-# define PLATFORM_ID "BSDOS"
-
-#elif defined(_MPRAS) || defined(MPRAS)
-# define PLATFORM_ID "MP-RAS"
-
-#elif defined(__osf) || defined(__osf__)
-# define PLATFORM_ID "OSF1"
-
-#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
-# define PLATFORM_ID "SCO_SV"
-
-#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
-# define PLATFORM_ID "ULTRIX"
-
-#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
-# define PLATFORM_ID "Xenix"
-
-#elif defined(__WATCOMC__)
-# if defined(__LINUX__)
-#  define PLATFORM_ID "Linux"
-
-# elif defined(__DOS__)
-#  define PLATFORM_ID "DOS"
-
-# elif defined(__OS2__)
-#  define PLATFORM_ID "OS2"
-
-# elif defined(__WINDOWS__)
-#  define PLATFORM_ID "Windows3x"
-
-# elif defined(__VXWORKS__)
-#  define PLATFORM_ID "VxWorks"
-
-# else /* unknown platform */
-#  define PLATFORM_ID
-# endif
-
-#elif defined(__INTEGRITY)
-# if defined(INT_178B)
-#  define PLATFORM_ID "Integrity178"
-
-# else /* regular Integrity */
-#  define PLATFORM_ID "Integrity"
-# endif
-
-#else /* unknown platform */
-# define PLATFORM_ID
-
-#endif
-
-/* For windows compilers MSVC and Intel we can determine
-   the architecture of the compiler being used.  This is because
-   the compilers do not have flags that can change the architecture,
-   but rather depend on which compiler is being used
-*/
-#if defined(_WIN32) && defined(_MSC_VER)
-# if defined(_M_IA64)
-#  define ARCHITECTURE_ID "IA64"
-
-# elif defined(_M_ARM64EC)
-#  define ARCHITECTURE_ID "ARM64EC"
-
-# elif defined(_M_X64) || defined(_M_AMD64)
-#  define ARCHITECTURE_ID "x64"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# elif defined(_M_ARM64)
-#  define ARCHITECTURE_ID "ARM64"
-
-# elif defined(_M_ARM)
-#  if _M_ARM == 4
-#   define ARCHITECTURE_ID "ARMV4I"
-#  elif _M_ARM == 5
-#   define ARCHITECTURE_ID "ARMV5I"
-#  else
-#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
-#  endif
-
-# elif defined(_M_MIPS)
-#  define ARCHITECTURE_ID "MIPS"
-
-# elif defined(_M_SH)
-#  define ARCHITECTURE_ID "SHx"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__WATCOMC__)
-# if defined(_M_I86)
-#  define ARCHITECTURE_ID "I86"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
-# if defined(__ICCARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__ICCRX__)
-#  define ARCHITECTURE_ID "RX"
-
-# elif defined(__ICCRH850__)
-#  define ARCHITECTURE_ID "RH850"
-
-# elif defined(__ICCRL78__)
-#  define ARCHITECTURE_ID "RL78"
-
-# elif defined(__ICCRISCV__)
-#  define ARCHITECTURE_ID "RISCV"
-
-# elif defined(__ICCAVR__)
-#  define ARCHITECTURE_ID "AVR"
-
-# elif defined(__ICC430__)
-#  define ARCHITECTURE_ID "MSP430"
-
-# elif defined(__ICCV850__)
-#  define ARCHITECTURE_ID "V850"
-
-# elif defined(__ICC8051__)
-#  define ARCHITECTURE_ID "8051"
-
-# elif defined(__ICCSTM8__)
-#  define ARCHITECTURE_ID "STM8"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__ghs__)
-# if defined(__PPC64__)
-#  define ARCHITECTURE_ID "PPC64"
-
-# elif defined(__ppc__)
-#  define ARCHITECTURE_ID "PPC"
-
-# elif defined(__ARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__x86_64__)
-#  define ARCHITECTURE_ID "x64"
-
-# elif defined(__i386__)
-#  define ARCHITECTURE_ID "X86"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__TI_COMPILER_VERSION__)
-# if defined(__TI_ARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__MSP430__)
-#  define ARCHITECTURE_ID "MSP430"
-
-# elif defined(__TMS320C28XX__)
-#  define ARCHITECTURE_ID "TMS320C28x"
-
-# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
-#  define ARCHITECTURE_ID "TMS320C6x"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#else
-#  define ARCHITECTURE_ID
-#endif
-
-/* Convert integer to decimal digit literals.  */
-#define DEC(n)                   \
-  ('0' + (((n) / 10000000)%10)), \
-  ('0' + (((n) / 1000000)%10)),  \
-  ('0' + (((n) / 100000)%10)),   \
-  ('0' + (((n) / 10000)%10)),    \
-  ('0' + (((n) / 1000)%10)),     \
-  ('0' + (((n) / 100)%10)),      \
-  ('0' + (((n) / 10)%10)),       \
-  ('0' +  ((n) % 10))
-
-/* Convert integer to hex digit literals.  */
-#define HEX(n)             \
-  ('0' + ((n)>>28 & 0xF)), \
-  ('0' + ((n)>>24 & 0xF)), \
-  ('0' + ((n)>>20 & 0xF)), \
-  ('0' + ((n)>>16 & 0xF)), \
-  ('0' + ((n)>>12 & 0xF)), \
-  ('0' + ((n)>>8  & 0xF)), \
-  ('0' + ((n)>>4  & 0xF)), \
-  ('0' + ((n)     & 0xF))
-
-/* Construct a string literal encoding the version number. */
-#ifdef COMPILER_VERSION
-char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
-
-/* Construct a string literal encoding the version number components. */
-#elif defined(COMPILER_VERSION_MAJOR)
-char const info_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
-  COMPILER_VERSION_MAJOR,
-# ifdef COMPILER_VERSION_MINOR
-  '.', COMPILER_VERSION_MINOR,
-#  ifdef COMPILER_VERSION_PATCH
-   '.', COMPILER_VERSION_PATCH,
-#   ifdef COMPILER_VERSION_TWEAK
-    '.', COMPILER_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct a string literal encoding the internal version number. */
-#ifdef COMPILER_VERSION_INTERNAL
-char const info_version_internal[] = {
-  'I', 'N', 'F', 'O', ':',
-  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
-  'i','n','t','e','r','n','a','l','[',
-  COMPILER_VERSION_INTERNAL,']','\0'};
-#elif defined(COMPILER_VERSION_INTERNAL_STR)
-char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
-#endif
-
-/* Construct a string literal encoding the version number components. */
-#ifdef SIMULATE_VERSION_MAJOR
-char const info_simulate_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
-  SIMULATE_VERSION_MAJOR,
-# ifdef SIMULATE_VERSION_MINOR
-  '.', SIMULATE_VERSION_MINOR,
-#  ifdef SIMULATE_VERSION_PATCH
-   '.', SIMULATE_VERSION_PATCH,
-#   ifdef SIMULATE_VERSION_TWEAK
-    '.', SIMULATE_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
-char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
-
-
-
-#if !defined(__STDC__) && !defined(__clang__)
-# if defined(_MSC_VER) || defined(__ibmxl__) || defined(__IBMC__)
-#  define C_VERSION "90"
-# else
-#  define C_VERSION
-# endif
-#elif __STDC_VERSION__ > 201710L
-# define C_VERSION "23"
-#elif __STDC_VERSION__ >= 201710L
-# define C_VERSION "17"
-#elif __STDC_VERSION__ >= 201000L
-# define C_VERSION "11"
-#elif __STDC_VERSION__ >= 199901L
-# define C_VERSION "99"
-#else
-# define C_VERSION "90"
-#endif
-const char* info_language_standard_default =
-  "INFO" ":" "standard_default[" C_VERSION "]";
-
-const char* info_language_extensions_default = "INFO" ":" "extensions_default["
-/* !defined(_MSC_VER) to exclude Clang's MSVC compatibility mode. */
-#if (defined(__clang__) || defined(__GNUC__) ||                               \
-     defined(__TI_COMPILER_VERSION__)) &&                                     \
-  !defined(__STRICT_ANSI__) && !defined(_MSC_VER)
-  "ON"
-#else
-  "OFF"
-#endif
-"]";
-
-/*--------------------------------------------------------------------------*/
-
-#ifdef ID_VOID_MAIN
-void main() {}
-#else
-# if defined(__CLASSIC_C__)
-int main(argc, argv) int argc; char *argv[];
-# else
-int main(int argc, char* argv[])
-# endif
-{
-  int require = 0;
-  require += info_compiler[argc];
-  require += info_platform[argc];
-  require += info_arch[argc];
-#ifdef COMPILER_VERSION_MAJOR
-  require += info_version[argc];
-#endif
-#ifdef COMPILER_VERSION_INTERNAL
-  require += info_version_internal[argc];
-#endif
-#ifdef SIMULATE_ID
-  require += info_simulate[argc];
-#endif
-#ifdef SIMULATE_VERSION_MAJOR
-  require += info_simulate_version[argc];
-#endif
-#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
-  require += info_cray[argc];
-#endif
-  require += info_language_standard_default[argc];
-  require += info_language_extensions_default[argc];
-  (void)argv;
-  return require;
-}
-#endif
diff --git a/examples/sycl/CMakeFiles/3.22.1/CompilerIdC/a.out b/examples/sycl/CMakeFiles/3.22.1/CompilerIdC/a.out
deleted file mode 100755
index 30f4c1c1a2b997c05c3def9a97c52e99170eb848..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16256
zcmeHOeQXrR6`wns;3R~z9jIMCm(+<&f%Tm+;1WL8_=CO59|R0?Wyp4Yx3*8*hr74e
zoN3h%kt)zA5lN*=l_HftB30@iQBl(>id0HWMN(C2%|8vTA{8Pc5vfV2nzksST;H2{
z?^rK(`k_eqXUE$8&3p5HZ)V@jt!H;<p6u=Ki$)@XQ&j8{C>iyJRH-w_ol+!Sr+7dZ
zVzpQ$?g33L9EL1GR5`}WQB}ZLM|L%U$ZN)oL}L!Fl9GTLRN@vQBJWNzD66o%NQsQ1
zUdhOlMI;NlKE|_D0i&NM>(i7Up&2Se_!{vT{XC3Kv#!jWcPSpDL2)q&nTI0sT8Y<6
zJjS0=eT*DWY$G0>6wl5Y10<9cYA8U`<XMOyZ-jUwWY5U<j2!y|?|JF}r@no}t6irW
zQoGKO8ODuDEf~4J$AE`%`EL_<^M0ytp?<g?Q63pd6m8ke*w&7=Y<g2RlP{Gwl{>d?
z+S;KP3wpaGV7ssm98-G-4~pn<aUONd=W>*>9216^yBiU?Sgx+?Dag?n1-#_^E+-WW
zoX}(AJZ0!ln@7KT0X@cfKKnk<KLp2J$x03nSZT*~CNf3Oafb)GvxU4fY>#CfVObNo
zLf$HRw(D7zkfsY5rpmT8p2^$U%rWFav{S^(*_pfmIfudCin)}^?d?zQ>9*SSc72QB
z{N@g5U=*T=e;D5gNZ$bD{x7Y_+>4R_7U^Dn;Rg`zsCWisOqX{X60r|3cAT#G^xS7y
zcHO6^pNyp3@acKXV!=(Ho^DOjK0JjH2qO?iAdEm5fiMDL1i}b}5%@eJ@VD5SPs|g4
zsWYEl^v5<K%<rD}qLo?m#INfv$mUeGybJ2e!`I*(TWdgW26E%4xvj`ge2_eK>2dSa
zyXJ{&H}(zpCe9~*W4`%jyaCLY7s34FaXq&7X-P4k+j$i<=5vewgz%M{-g2n=5ayxk
zdn;FBYmcMh1wybtEA6-Z1Y!Mc^VALVjSqL5Z``agBNxp}x4mWH;9+u5SGh7CTibJo
zkL^3YGY7?@wBew6VrLYy=Bf9*CFZj`*MM++eWg;lo`%!KMZF+LjzHbj`jP+s6HrL)
ziM?{T<<?XEEw_y05Rg}fTW&$^e};#3Gxht}xmt+h@~a@l&MmtyG6O~LNTIeacJAzH
z%@C1`kw1X;7gg;Gv#3SO*QIVpUF7Y^KSBH%nM7`0sP<8{SKt!D6h<J7Kp25A0$~Kg
z2!s&`BM?R)j6fKHFalu&{)Y(Q_p;<*-_Z6{AvcxDI__vP?>X82{zC&tR6#M5D`joZ
z8HLZ7q?!VGyx`_W`|;Z-DQ!13Im#T*&ZliR4Ic`}?NZhoO>CnQr|dcTVy2KUR?CJ4
zkKAeSoVHlsbX>F@jYDdWw|D56!BoxosM3k#bCc)rfmP+VGe3d+$jX{WTLJOA-Kvi&
zmBWA=0r8E+4nTOFAU58pR9*sH1_!3dvHc=au8*u-vbgRvd^kk?DmZR@SgAaM20e@A
zC&(7a;y23IKra<b>-&~Ae>Hah6LrVM?iG)Id2@^W%!+)h_YBzaw|NldcT5lPB(wuD
zOko7V2!s&`BM?R)j6fKHFalu&!U%*B_^czK!N*%nS7@CyuZ^6LT6q2;#I=@~mMDM+
z7DMJ9Ao*qT^F@+*{qsta`B`TL$=WVM=I~hx6PN$}cBO!FxyC?HYq<gAwFZi`h8CWm
zsQf;u72-w0`w5!~mlHBn*K!($R4wL$MM~y&lt~YZN@Zeyw=1fBv(z5=Vl3Osae~K3
zDbG+j(LZ<Kb>#~+M5B1r!e?qss|n+Ty@Zbu9wpq{-Mw9FJvdg%dnGNA(BaX0Q=%lr
z#CO`^`MYC1sfB#u(7J!U2J48s_9eBRecc;2w85cu(>n=GJ^Sl%t(9{+WHu+_iEZm;
zvlcvI{A<NuD?tP7KU^^!T<8Aa0`_Y_AB9~HQ#8KWpGNvK0sUsu^Y4f4laPssW-($I
zQpMu~*K<e2gM!DuEQHgy;lO(N_sIH_kco&!!M`Wg{}?j!UGGBI^}6)aAYNT(u!}0r
z=jSbHzf1(j%_YgI7rc(2om`XlUl4+>m-U}OrjZ37HbFo%pUl}2sc#a^0Y95TUoV=f
z@nrk$pl^&W6@33;y#e|L5fixV0#my@3vr121pTK#uaS)(T=Wxdg6$xNUoD<QJ^1H+
zBDGZdywo>|PO|6w$xEc)7tsF_^z|@Kz8>&@81#*}`-aAweAfhZ%#feffS+0N!{dz`
z`wscRMLL-H-O^R`1J@O_|A73P0sl*)^T+2A&^JK8!~^ynpnpi#IQKLJ`o<{SwFAGS
zgP_NF;^G-h++Sl--vq--={cU_K+5$JdI9zn*kc*XvnK>7ie71aTu+HR`xY!OXQgoG
zLJ=&}1#2Q(7_+lh+AFw4%Py4#z7Neho|D$&^IPEdg^Xppu03rzdC#2|<F1`^taK@t
zn+A$cvcS@tW7<7nA9cF<{+3Kb+bpYZf7d{-)jQaOdnbCnHrO?g>;^TwceQ#=Dm8od
z3u|xx(4MY-YpAd9K<}_M+_k5_7Z|}!6ozk;#Ha4saMHGC3wb>vlOHcw!S|=&8%@*(
z4G`w;^03myf;9;*d~pXzatN%_nY>jhI%(yN-xez=Yv(6Q_Jm{I{U%sI9ErnkqoJ~~
zVv#DAJ3}lBwwzFqWMiuiVcW@^m(HKE|76ZpLLL6yDAlmxJ`>q<s^Ro5S_$~FTL`^4
zo%8H5K+jb;$+--?MR%u!o-cTgK9Mi!Q*L3(alL6$b?r%R!oexaChg*+(9_d-@Tbso
zRS8!r%}B7I%yqIh5`>OJOXyQsPoF42#)Gg59fu|u6=cWi&Lj=S$uv+|QdBi66@&BO
z*Un{9(1L=8o}o!NKy>H{p+nE)V7SX4<7aTfc_7?c;D4M-%{7M1@V-Ev2U4-L<vik-
z;J~>W^E-*p``|Pv^QY>4f|OeU{d}Ge@P0VWPli0-&sp6AnSa3Cj_WT|{bia^wvZw3
z{R!4T2<138W&RoBpCLXkw7dswB5*LD=beW@ho;Qu`EZ;X`XZG16Z-I{$3eikD)V`M
zIzxP(*K_;Xj`1k)aqi19&wpo$AE(NK?dLWegAAH7U(kG;C-^+y=K6!z|1@y%n$W*Y
z{QpCKY!QN=Gq{+V#iF<dIX_<`zD9h4sy)k$-wW{L#E+8!x0fBT%=rBPpZ~8L#7_r0
zjO%58KMe4BzVmW}vf$@1whvzakBDE5zh)>?O+oZO)Q|H`G+{o^|9F2V?l1JGVEsRb
zLO(yg&XDHub&7RTLG{dWW5ur^vlI^cxm4%Fyx%iu&obk0feW*>Dxc@y8Z5uX#CA%$
zBeVW(D8RAD_4B-#_wVvPT^^sz<9>V|xOn}{=lzJ6+dv7aRYCR4e8wwK;^)JJNZG~}
z@O?r_cenq2DDd-n{@YA^gNcM(FRMR-4C=W3JZ~H!e$XE4@~4kMKpmM^i<po17k^^D
zA9?Kg?cU=0X~5xqpb-vk561<c1G(*f{%nVmvH|B@FWchJY(D;uZOY<bY)Ba7{~NAG
BSUms$

diff --git a/examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/CMakeCXXCompilerId.cpp b/examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/CMakeCXXCompilerId.cpp
deleted file mode 100644
index 25c62a8c3cb40..0000000000000
--- a/examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/CMakeCXXCompilerId.cpp
+++ /dev/null
@@ -1,791 +0,0 @@
-/* This source file must have a .cpp extension so that all C++ compilers
-   recognize the extension without flags.  Borland does not know .cxx for
-   example.  */
-#ifndef __cplusplus
-# error "A C compiler has been selected for C++."
-#endif
-
-#if !defined(__has_include)
-/* If the compiler does not have __has_include, pretend the answer is
-   always no.  */
-#  define __has_include(x) 0
-#endif
-
-
-/* Version number components: V=Version, R=Revision, P=Patch
-   Version date components:   YYYY=Year, MM=Month,   DD=Day  */
-
-#if defined(__COMO__)
-# define COMPILER_ID "Comeau"
-  /* __COMO_VERSION__ = VRR */
-# define COMPILER_VERSION_MAJOR DEC(__COMO_VERSION__ / 100)
-# define COMPILER_VERSION_MINOR DEC(__COMO_VERSION__ % 100)
-
-#elif defined(__INTEL_COMPILER) || defined(__ICC)
-# define COMPILER_ID "Intel"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# if defined(__GNUC__)
-#  define SIMULATE_ID "GNU"
-# endif
-  /* __INTEL_COMPILER = VRP prior to 2021, and then VVVV for 2021 and later,
-     except that a few beta releases use the old format with V=2021.  */
-# if __INTEL_COMPILER < 2021 || __INTEL_COMPILER == 202110 || __INTEL_COMPILER == 202111
-#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER/100)
-#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER/10 % 10)
-#  if defined(__INTEL_COMPILER_UPDATE)
-#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER_UPDATE)
-#  else
-#   define COMPILER_VERSION_PATCH DEC(__INTEL_COMPILER   % 10)
-#  endif
-# else
-#  define COMPILER_VERSION_MAJOR DEC(__INTEL_COMPILER)
-#  define COMPILER_VERSION_MINOR DEC(__INTEL_COMPILER_UPDATE)
-   /* The third version component from --version is an update index,
-      but no macro is provided for it.  */
-#  define COMPILER_VERSION_PATCH DEC(0)
-# endif
-# if defined(__INTEL_COMPILER_BUILD_DATE)
-   /* __INTEL_COMPILER_BUILD_DATE = YYYYMMDD */
-#  define COMPILER_VERSION_TWEAK DEC(__INTEL_COMPILER_BUILD_DATE)
-# endif
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-# if defined(__GNUC__)
-#  define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
-# elif defined(__GNUG__)
-#  define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
-# endif
-# if defined(__GNUC_MINOR__)
-#  define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
-# endif
-# if defined(__GNUC_PATCHLEVEL__)
-#  define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-# endif
-
-#elif (defined(__clang__) && defined(__INTEL_CLANG_COMPILER)) || defined(__INTEL_LLVM_COMPILER)
-# define COMPILER_ID "IntelLLVM"
-#if defined(_MSC_VER)
-# define SIMULATE_ID "MSVC"
-#endif
-#if defined(__GNUC__)
-# define SIMULATE_ID "GNU"
-#endif
-/* __INTEL_LLVM_COMPILER = VVVVRP prior to 2021.2.0, VVVVRRPP for 2021.2.0 and
- * later.  Look for 6 digit vs. 8 digit version number to decide encoding.
- * VVVV is no smaller than the current year when a version is released.
- */
-#if __INTEL_LLVM_COMPILER < 1000000L
-# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/100)
-# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER    % 10)
-#else
-# define COMPILER_VERSION_MAJOR DEC(__INTEL_LLVM_COMPILER/10000)
-# define COMPILER_VERSION_MINOR DEC(__INTEL_LLVM_COMPILER/100 % 100)
-# define COMPILER_VERSION_PATCH DEC(__INTEL_LLVM_COMPILER     % 100)
-#endif
-#if defined(_MSC_VER)
-  /* _MSC_VER = VVRR */
-# define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-# define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-#endif
-#if defined(__GNUC__)
-# define SIMULATE_VERSION_MAJOR DEC(__GNUC__)
-#elif defined(__GNUG__)
-# define SIMULATE_VERSION_MAJOR DEC(__GNUG__)
-#endif
-#if defined(__GNUC_MINOR__)
-# define SIMULATE_VERSION_MINOR DEC(__GNUC_MINOR__)
-#endif
-#if defined(__GNUC_PATCHLEVEL__)
-# define SIMULATE_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-#endif
-
-#elif defined(__PATHCC__)
-# define COMPILER_ID "PathScale"
-# define COMPILER_VERSION_MAJOR DEC(__PATHCC__)
-# define COMPILER_VERSION_MINOR DEC(__PATHCC_MINOR__)
-# if defined(__PATHCC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PATHCC_PATCHLEVEL__)
-# endif
-
-#elif defined(__BORLANDC__) && defined(__CODEGEARC_VERSION__)
-# define COMPILER_ID "Embarcadero"
-# define COMPILER_VERSION_MAJOR HEX(__CODEGEARC_VERSION__>>24 & 0x00FF)
-# define COMPILER_VERSION_MINOR HEX(__CODEGEARC_VERSION__>>16 & 0x00FF)
-# define COMPILER_VERSION_PATCH DEC(__CODEGEARC_VERSION__     & 0xFFFF)
-
-#elif defined(__BORLANDC__)
-# define COMPILER_ID "Borland"
-  /* __BORLANDC__ = 0xVRR */
-# define COMPILER_VERSION_MAJOR HEX(__BORLANDC__>>8)
-# define COMPILER_VERSION_MINOR HEX(__BORLANDC__ & 0xFF)
-
-#elif defined(__WATCOMC__) && __WATCOMC__ < 1200
-# define COMPILER_ID "Watcom"
-   /* __WATCOMC__ = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(__WATCOMC__ / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__WATCOMC__)
-# define COMPILER_ID "OpenWatcom"
-   /* __WATCOMC__ = VVRP + 1100 */
-# define COMPILER_VERSION_MAJOR DEC((__WATCOMC__ - 1100) / 100)
-# define COMPILER_VERSION_MINOR DEC((__WATCOMC__ / 10) % 10)
-# if (__WATCOMC__ % 10) > 0
-#  define COMPILER_VERSION_PATCH DEC(__WATCOMC__ % 10)
-# endif
-
-#elif defined(__SUNPRO_CC)
-# define COMPILER_ID "SunPro"
-# if __SUNPRO_CC >= 0x5100
-   /* __SUNPRO_CC = 0xVRRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>12)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xFF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
-# else
-   /* __SUNPRO_CC = 0xVRP */
-#  define COMPILER_VERSION_MAJOR HEX(__SUNPRO_CC>>8)
-#  define COMPILER_VERSION_MINOR HEX(__SUNPRO_CC>>4 & 0xF)
-#  define COMPILER_VERSION_PATCH HEX(__SUNPRO_CC    & 0xF)
-# endif
-
-#elif defined(__HP_aCC)
-# define COMPILER_ID "HP"
-  /* __HP_aCC = VVRRPP */
-# define COMPILER_VERSION_MAJOR DEC(__HP_aCC/10000)
-# define COMPILER_VERSION_MINOR DEC(__HP_aCC/100 % 100)
-# define COMPILER_VERSION_PATCH DEC(__HP_aCC     % 100)
-
-#elif defined(__DECCXX)
-# define COMPILER_ID "Compaq"
-  /* __DECCXX_VER = VVRRTPPPP */
-# define COMPILER_VERSION_MAJOR DEC(__DECCXX_VER/10000000)
-# define COMPILER_VERSION_MINOR DEC(__DECCXX_VER/100000  % 100)
-# define COMPILER_VERSION_PATCH DEC(__DECCXX_VER         % 10000)
-
-#elif defined(__IBMCPP__) && defined(__COMPILER_VER__)
-# define COMPILER_ID "zOS"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__ibmxl__) && defined(__clang__)
-# define COMPILER_ID "XLClang"
-# define COMPILER_VERSION_MAJOR DEC(__ibmxl_version__)
-# define COMPILER_VERSION_MINOR DEC(__ibmxl_release__)
-# define COMPILER_VERSION_PATCH DEC(__ibmxl_modification__)
-# define COMPILER_VERSION_TWEAK DEC(__ibmxl_ptf_fix_level__)
-
-
-#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ >= 800
-# define COMPILER_ID "XL"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__IBMCPP__) && !defined(__COMPILER_VER__) && __IBMCPP__ < 800
-# define COMPILER_ID "VisualAge"
-  /* __IBMCPP__ = VRP */
-# define COMPILER_VERSION_MAJOR DEC(__IBMCPP__/100)
-# define COMPILER_VERSION_MINOR DEC(__IBMCPP__/10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__IBMCPP__    % 10)
-
-#elif defined(__NVCOMPILER)
-# define COMPILER_ID "NVHPC"
-# define COMPILER_VERSION_MAJOR DEC(__NVCOMPILER_MAJOR__)
-# define COMPILER_VERSION_MINOR DEC(__NVCOMPILER_MINOR__)
-# if defined(__NVCOMPILER_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__NVCOMPILER_PATCHLEVEL__)
-# endif
-
-#elif defined(__PGI)
-# define COMPILER_ID "PGI"
-# define COMPILER_VERSION_MAJOR DEC(__PGIC__)
-# define COMPILER_VERSION_MINOR DEC(__PGIC_MINOR__)
-# if defined(__PGIC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__PGIC_PATCHLEVEL__)
-# endif
-
-#elif defined(_CRAYC)
-# define COMPILER_ID "Cray"
-# define COMPILER_VERSION_MAJOR DEC(_RELEASE_MAJOR)
-# define COMPILER_VERSION_MINOR DEC(_RELEASE_MINOR)
-
-#elif defined(__TI_COMPILER_VERSION__)
-# define COMPILER_ID "TI"
-  /* __TI_COMPILER_VERSION__ = VVVRRRPPP */
-# define COMPILER_VERSION_MAJOR DEC(__TI_COMPILER_VERSION__/1000000)
-# define COMPILER_VERSION_MINOR DEC(__TI_COMPILER_VERSION__/1000   % 1000)
-# define COMPILER_VERSION_PATCH DEC(__TI_COMPILER_VERSION__        % 1000)
-
-#elif defined(__CLANG_FUJITSU)
-# define COMPILER_ID "FujitsuClang"
-# define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
-# define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
-# define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
-# define COMPILER_VERSION_INTERNAL_STR __clang_version__
-
-
-#elif defined(__FUJITSU)
-# define COMPILER_ID "Fujitsu"
-# if defined(__FCC_version__)
-#   define COMPILER_VERSION __FCC_version__
-# elif defined(__FCC_major__)
-#   define COMPILER_VERSION_MAJOR DEC(__FCC_major__)
-#   define COMPILER_VERSION_MINOR DEC(__FCC_minor__)
-#   define COMPILER_VERSION_PATCH DEC(__FCC_patchlevel__)
-# endif
-# if defined(__fcc_version)
-#   define COMPILER_VERSION_INTERNAL DEC(__fcc_version)
-# elif defined(__FCC_VERSION)
-#   define COMPILER_VERSION_INTERNAL DEC(__FCC_VERSION)
-# endif
-
-
-#elif defined(__ghs__)
-# define COMPILER_ID "GHS"
-/* __GHS_VERSION_NUMBER = VVVVRP */
-# ifdef __GHS_VERSION_NUMBER
-# define COMPILER_VERSION_MAJOR DEC(__GHS_VERSION_NUMBER / 100)
-# define COMPILER_VERSION_MINOR DEC(__GHS_VERSION_NUMBER / 10 % 10)
-# define COMPILER_VERSION_PATCH DEC(__GHS_VERSION_NUMBER      % 10)
-# endif
-
-#elif defined(__SCO_VERSION__)
-# define COMPILER_ID "SCO"
-
-#elif defined(__ARMCC_VERSION) && !defined(__clang__)
-# define COMPILER_ID "ARMCC"
-#if __ARMCC_VERSION >= 1000000
-  /* __ARMCC_VERSION = VRRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/1000000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 100)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION     % 10000)
-#else
-  /* __ARMCC_VERSION = VRPPPP */
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCC_VERSION/100000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCC_VERSION/10000 % 10)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCC_VERSION    % 10000)
-#endif
-
-
-#elif defined(__clang__) && defined(__apple_build_version__)
-# define COMPILER_ID "AppleClang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-# define COMPILER_VERSION_TWEAK DEC(__apple_build_version__)
-
-#elif defined(__clang__) && defined(__ARMCOMPILER_VERSION)
-# define COMPILER_ID "ARMClang"
-  # define COMPILER_VERSION_MAJOR DEC(__ARMCOMPILER_VERSION/1000000)
-  # define COMPILER_VERSION_MINOR DEC(__ARMCOMPILER_VERSION/10000 % 100)
-  # define COMPILER_VERSION_PATCH DEC(__ARMCOMPILER_VERSION     % 10000)
-# define COMPILER_VERSION_INTERNAL DEC(__ARMCOMPILER_VERSION)
-
-#elif defined(__clang__)
-# define COMPILER_ID "Clang"
-# if defined(_MSC_VER)
-#  define SIMULATE_ID "MSVC"
-# endif
-# define COMPILER_VERSION_MAJOR DEC(__clang_major__)
-# define COMPILER_VERSION_MINOR DEC(__clang_minor__)
-# define COMPILER_VERSION_PATCH DEC(__clang_patchlevel__)
-# if defined(_MSC_VER)
-   /* _MSC_VER = VVRR */
-#  define SIMULATE_VERSION_MAJOR DEC(_MSC_VER / 100)
-#  define SIMULATE_VERSION_MINOR DEC(_MSC_VER % 100)
-# endif
-
-#elif defined(__GNUC__) || defined(__GNUG__)
-# define COMPILER_ID "GNU"
-# if defined(__GNUC__)
-#  define COMPILER_VERSION_MAJOR DEC(__GNUC__)
-# else
-#  define COMPILER_VERSION_MAJOR DEC(__GNUG__)
-# endif
-# if defined(__GNUC_MINOR__)
-#  define COMPILER_VERSION_MINOR DEC(__GNUC_MINOR__)
-# endif
-# if defined(__GNUC_PATCHLEVEL__)
-#  define COMPILER_VERSION_PATCH DEC(__GNUC_PATCHLEVEL__)
-# endif
-
-#elif defined(_MSC_VER)
-# define COMPILER_ID "MSVC"
-  /* _MSC_VER = VVRR */
-# define COMPILER_VERSION_MAJOR DEC(_MSC_VER / 100)
-# define COMPILER_VERSION_MINOR DEC(_MSC_VER % 100)
-# if defined(_MSC_FULL_VER)
-#  if _MSC_VER >= 1400
-    /* _MSC_FULL_VER = VVRRPPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 100000)
-#  else
-    /* _MSC_FULL_VER = VVRRPPPP */
-#   define COMPILER_VERSION_PATCH DEC(_MSC_FULL_VER % 10000)
-#  endif
-# endif
-# if defined(_MSC_BUILD)
-#  define COMPILER_VERSION_TWEAK DEC(_MSC_BUILD)
-# endif
-
-#elif defined(__VISUALDSPVERSION__) || defined(__ADSPBLACKFIN__) || defined(__ADSPTS__) || defined(__ADSP21000__)
-# define COMPILER_ID "ADSP"
-#if defined(__VISUALDSPVERSION__)
-  /* __VISUALDSPVERSION__ = 0xVVRRPP00 */
-# define COMPILER_VERSION_MAJOR HEX(__VISUALDSPVERSION__>>24)
-# define COMPILER_VERSION_MINOR HEX(__VISUALDSPVERSION__>>16 & 0xFF)
-# define COMPILER_VERSION_PATCH HEX(__VISUALDSPVERSION__>>8  & 0xFF)
-#endif
-
-#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
-# define COMPILER_ID "IAR"
-# if defined(__VER__) && defined(__ICCARM__)
-#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 1000000)
-#  define COMPILER_VERSION_MINOR DEC(((__VER__) / 1000) % 1000)
-#  define COMPILER_VERSION_PATCH DEC((__VER__) % 1000)
-#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
-# elif defined(__VER__) && (defined(__ICCAVR__) || defined(__ICCRX__) || defined(__ICCRH850__) || defined(__ICCRL78__) || defined(__ICC430__) || defined(__ICCRISCV__) || defined(__ICCV850__) || defined(__ICC8051__) || defined(__ICCSTM8__))
-#  define COMPILER_VERSION_MAJOR DEC((__VER__) / 100)
-#  define COMPILER_VERSION_MINOR DEC((__VER__) - (((__VER__) / 100)*100))
-#  define COMPILER_VERSION_PATCH DEC(__SUBVERSION__)
-#  define COMPILER_VERSION_INTERNAL DEC(__IAR_SYSTEMS_ICC__)
-# endif
-
-
-/* These compilers are either not known or too old to define an
-  identification macro.  Try to identify the platform and guess that
-  it is the native compiler.  */
-#elif defined(__hpux) || defined(__hpua)
-# define COMPILER_ID "HP"
-
-#else /* unknown compiler */
-# define COMPILER_ID ""
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_compiler = "INFO" ":" "compiler[" COMPILER_ID "]";
-#ifdef SIMULATE_ID
-char const* info_simulate = "INFO" ":" "simulate[" SIMULATE_ID "]";
-#endif
-
-#ifdef __QNXNTO__
-char const* qnxnto = "INFO" ":" "qnxnto[]";
-#endif
-
-#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
-char const *info_cray = "INFO" ":" "compiler_wrapper[CrayPrgEnv]";
-#endif
-
-#define STRINGIFY_HELPER(X) #X
-#define STRINGIFY(X) STRINGIFY_HELPER(X)
-
-/* Identify known platforms by name.  */
-#if defined(__linux) || defined(__linux__) || defined(linux)
-# define PLATFORM_ID "Linux"
-
-#elif defined(__MSYS__)
-# define PLATFORM_ID "MSYS"
-
-#elif defined(__CYGWIN__)
-# define PLATFORM_ID "Cygwin"
-
-#elif defined(__MINGW32__)
-# define PLATFORM_ID "MinGW"
-
-#elif defined(__APPLE__)
-# define PLATFORM_ID "Darwin"
-
-#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32)
-# define PLATFORM_ID "Windows"
-
-#elif defined(__FreeBSD__) || defined(__FreeBSD)
-# define PLATFORM_ID "FreeBSD"
-
-#elif defined(__NetBSD__) || defined(__NetBSD)
-# define PLATFORM_ID "NetBSD"
-
-#elif defined(__OpenBSD__) || defined(__OPENBSD)
-# define PLATFORM_ID "OpenBSD"
-
-#elif defined(__sun) || defined(sun)
-# define PLATFORM_ID "SunOS"
-
-#elif defined(_AIX) || defined(__AIX) || defined(__AIX__) || defined(__aix) || defined(__aix__)
-# define PLATFORM_ID "AIX"
-
-#elif defined(__hpux) || defined(__hpux__)
-# define PLATFORM_ID "HP-UX"
-
-#elif defined(__HAIKU__)
-# define PLATFORM_ID "Haiku"
-
-#elif defined(__BeOS) || defined(__BEOS__) || defined(_BEOS)
-# define PLATFORM_ID "BeOS"
-
-#elif defined(__QNX__) || defined(__QNXNTO__)
-# define PLATFORM_ID "QNX"
-
-#elif defined(__tru64) || defined(_tru64) || defined(__TRU64__)
-# define PLATFORM_ID "Tru64"
-
-#elif defined(__riscos) || defined(__riscos__)
-# define PLATFORM_ID "RISCos"
-
-#elif defined(__sinix) || defined(__sinix__) || defined(__SINIX__)
-# define PLATFORM_ID "SINIX"
-
-#elif defined(__UNIX_SV__)
-# define PLATFORM_ID "UNIX_SV"
-
-#elif defined(__bsdos__)
-# define PLATFORM_ID "BSDOS"
-
-#elif defined(_MPRAS) || defined(MPRAS)
-# define PLATFORM_ID "MP-RAS"
-
-#elif defined(__osf) || defined(__osf__)
-# define PLATFORM_ID "OSF1"
-
-#elif defined(_SCO_SV) || defined(SCO_SV) || defined(sco_sv)
-# define PLATFORM_ID "SCO_SV"
-
-#elif defined(__ultrix) || defined(__ultrix__) || defined(_ULTRIX)
-# define PLATFORM_ID "ULTRIX"
-
-#elif defined(__XENIX__) || defined(_XENIX) || defined(XENIX)
-# define PLATFORM_ID "Xenix"
-
-#elif defined(__WATCOMC__)
-# if defined(__LINUX__)
-#  define PLATFORM_ID "Linux"
-
-# elif defined(__DOS__)
-#  define PLATFORM_ID "DOS"
-
-# elif defined(__OS2__)
-#  define PLATFORM_ID "OS2"
-
-# elif defined(__WINDOWS__)
-#  define PLATFORM_ID "Windows3x"
-
-# elif defined(__VXWORKS__)
-#  define PLATFORM_ID "VxWorks"
-
-# else /* unknown platform */
-#  define PLATFORM_ID
-# endif
-
-#elif defined(__INTEGRITY)
-# if defined(INT_178B)
-#  define PLATFORM_ID "Integrity178"
-
-# else /* regular Integrity */
-#  define PLATFORM_ID "Integrity"
-# endif
-
-#else /* unknown platform */
-# define PLATFORM_ID
-
-#endif
-
-/* For windows compilers MSVC and Intel we can determine
-   the architecture of the compiler being used.  This is because
-   the compilers do not have flags that can change the architecture,
-   but rather depend on which compiler is being used
-*/
-#if defined(_WIN32) && defined(_MSC_VER)
-# if defined(_M_IA64)
-#  define ARCHITECTURE_ID "IA64"
-
-# elif defined(_M_ARM64EC)
-#  define ARCHITECTURE_ID "ARM64EC"
-
-# elif defined(_M_X64) || defined(_M_AMD64)
-#  define ARCHITECTURE_ID "x64"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# elif defined(_M_ARM64)
-#  define ARCHITECTURE_ID "ARM64"
-
-# elif defined(_M_ARM)
-#  if _M_ARM == 4
-#   define ARCHITECTURE_ID "ARMV4I"
-#  elif _M_ARM == 5
-#   define ARCHITECTURE_ID "ARMV5I"
-#  else
-#   define ARCHITECTURE_ID "ARMV" STRINGIFY(_M_ARM)
-#  endif
-
-# elif defined(_M_MIPS)
-#  define ARCHITECTURE_ID "MIPS"
-
-# elif defined(_M_SH)
-#  define ARCHITECTURE_ID "SHx"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__WATCOMC__)
-# if defined(_M_I86)
-#  define ARCHITECTURE_ID "I86"
-
-# elif defined(_M_IX86)
-#  define ARCHITECTURE_ID "X86"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__IAR_SYSTEMS_ICC__) || defined(__IAR_SYSTEMS_ICC)
-# if defined(__ICCARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__ICCRX__)
-#  define ARCHITECTURE_ID "RX"
-
-# elif defined(__ICCRH850__)
-#  define ARCHITECTURE_ID "RH850"
-
-# elif defined(__ICCRL78__)
-#  define ARCHITECTURE_ID "RL78"
-
-# elif defined(__ICCRISCV__)
-#  define ARCHITECTURE_ID "RISCV"
-
-# elif defined(__ICCAVR__)
-#  define ARCHITECTURE_ID "AVR"
-
-# elif defined(__ICC430__)
-#  define ARCHITECTURE_ID "MSP430"
-
-# elif defined(__ICCV850__)
-#  define ARCHITECTURE_ID "V850"
-
-# elif defined(__ICC8051__)
-#  define ARCHITECTURE_ID "8051"
-
-# elif defined(__ICCSTM8__)
-#  define ARCHITECTURE_ID "STM8"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__ghs__)
-# if defined(__PPC64__)
-#  define ARCHITECTURE_ID "PPC64"
-
-# elif defined(__ppc__)
-#  define ARCHITECTURE_ID "PPC"
-
-# elif defined(__ARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__x86_64__)
-#  define ARCHITECTURE_ID "x64"
-
-# elif defined(__i386__)
-#  define ARCHITECTURE_ID "X86"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#elif defined(__TI_COMPILER_VERSION__)
-# if defined(__TI_ARM__)
-#  define ARCHITECTURE_ID "ARM"
-
-# elif defined(__MSP430__)
-#  define ARCHITECTURE_ID "MSP430"
-
-# elif defined(__TMS320C28XX__)
-#  define ARCHITECTURE_ID "TMS320C28x"
-
-# elif defined(__TMS320C6X__) || defined(_TMS320C6X)
-#  define ARCHITECTURE_ID "TMS320C6x"
-
-# else /* unknown architecture */
-#  define ARCHITECTURE_ID ""
-# endif
-
-#else
-#  define ARCHITECTURE_ID
-#endif
-
-/* Convert integer to decimal digit literals.  */
-#define DEC(n)                   \
-  ('0' + (((n) / 10000000)%10)), \
-  ('0' + (((n) / 1000000)%10)),  \
-  ('0' + (((n) / 100000)%10)),   \
-  ('0' + (((n) / 10000)%10)),    \
-  ('0' + (((n) / 1000)%10)),     \
-  ('0' + (((n) / 100)%10)),      \
-  ('0' + (((n) / 10)%10)),       \
-  ('0' +  ((n) % 10))
-
-/* Convert integer to hex digit literals.  */
-#define HEX(n)             \
-  ('0' + ((n)>>28 & 0xF)), \
-  ('0' + ((n)>>24 & 0xF)), \
-  ('0' + ((n)>>20 & 0xF)), \
-  ('0' + ((n)>>16 & 0xF)), \
-  ('0' + ((n)>>12 & 0xF)), \
-  ('0' + ((n)>>8  & 0xF)), \
-  ('0' + ((n)>>4  & 0xF)), \
-  ('0' + ((n)     & 0xF))
-
-/* Construct a string literal encoding the version number. */
-#ifdef COMPILER_VERSION
-char const* info_version = "INFO" ":" "compiler_version[" COMPILER_VERSION "]";
-
-/* Construct a string literal encoding the version number components. */
-#elif defined(COMPILER_VERSION_MAJOR)
-char const info_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','[',
-  COMPILER_VERSION_MAJOR,
-# ifdef COMPILER_VERSION_MINOR
-  '.', COMPILER_VERSION_MINOR,
-#  ifdef COMPILER_VERSION_PATCH
-   '.', COMPILER_VERSION_PATCH,
-#   ifdef COMPILER_VERSION_TWEAK
-    '.', COMPILER_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct a string literal encoding the internal version number. */
-#ifdef COMPILER_VERSION_INTERNAL
-char const info_version_internal[] = {
-  'I', 'N', 'F', 'O', ':',
-  'c','o','m','p','i','l','e','r','_','v','e','r','s','i','o','n','_',
-  'i','n','t','e','r','n','a','l','[',
-  COMPILER_VERSION_INTERNAL,']','\0'};
-#elif defined(COMPILER_VERSION_INTERNAL_STR)
-char const* info_version_internal = "INFO" ":" "compiler_version_internal[" COMPILER_VERSION_INTERNAL_STR "]";
-#endif
-
-/* Construct a string literal encoding the version number components. */
-#ifdef SIMULATE_VERSION_MAJOR
-char const info_simulate_version[] = {
-  'I', 'N', 'F', 'O', ':',
-  's','i','m','u','l','a','t','e','_','v','e','r','s','i','o','n','[',
-  SIMULATE_VERSION_MAJOR,
-# ifdef SIMULATE_VERSION_MINOR
-  '.', SIMULATE_VERSION_MINOR,
-#  ifdef SIMULATE_VERSION_PATCH
-   '.', SIMULATE_VERSION_PATCH,
-#   ifdef SIMULATE_VERSION_TWEAK
-    '.', SIMULATE_VERSION_TWEAK,
-#   endif
-#  endif
-# endif
-  ']','\0'};
-#endif
-
-/* Construct the string literal in pieces to prevent the source from
-   getting matched.  Store it in a pointer rather than an array
-   because some compilers will just produce instructions to fill the
-   array rather than assigning a pointer to a static array.  */
-char const* info_platform = "INFO" ":" "platform[" PLATFORM_ID "]";
-char const* info_arch = "INFO" ":" "arch[" ARCHITECTURE_ID "]";
-
-
-
-#if defined(__INTEL_COMPILER) && defined(_MSVC_LANG) && _MSVC_LANG < 201403L
-#  if defined(__INTEL_CXX11_MODE__)
-#    if defined(__cpp_aggregate_nsdmi)
-#      define CXX_STD 201402L
-#    else
-#      define CXX_STD 201103L
-#    endif
-#  else
-#    define CXX_STD 199711L
-#  endif
-#elif defined(_MSC_VER) && defined(_MSVC_LANG)
-#  define CXX_STD _MSVC_LANG
-#else
-#  define CXX_STD __cplusplus
-#endif
-
-const char* info_language_standard_default = "INFO" ":" "standard_default["
-#if CXX_STD > 202002L
-  "23"
-#elif CXX_STD > 201703L
-  "20"
-#elif CXX_STD >= 201703L
-  "17"
-#elif CXX_STD >= 201402L
-  "14"
-#elif CXX_STD >= 201103L
-  "11"
-#else
-  "98"
-#endif
-"]";
-
-const char* info_language_extensions_default = "INFO" ":" "extensions_default["
-/* !defined(_MSC_VER) to exclude Clang's MSVC compatibility mode. */
-#if (defined(__clang__) || defined(__GNUC__) ||                               \
-     defined(__TI_COMPILER_VERSION__)) &&                                     \
-  !defined(__STRICT_ANSI__) && !defined(_MSC_VER)
-  "ON"
-#else
-  "OFF"
-#endif
-"]";
-
-/*--------------------------------------------------------------------------*/
-
-int main(int argc, char* argv[])
-{
-  int require = 0;
-  require += info_compiler[argc];
-  require += info_platform[argc];
-#ifdef COMPILER_VERSION_MAJOR
-  require += info_version[argc];
-#endif
-#ifdef COMPILER_VERSION_INTERNAL
-  require += info_version_internal[argc];
-#endif
-#ifdef SIMULATE_ID
-  require += info_simulate[argc];
-#endif
-#ifdef SIMULATE_VERSION_MAJOR
-  require += info_simulate_version[argc];
-#endif
-#if defined(__CRAYXT_COMPUTE_LINUX_TARGET)
-  require += info_cray[argc];
-#endif
-  require += info_language_standard_default[argc];
-  require += info_language_extensions_default[argc];
-  (void)argv;
-  return require;
-}
diff --git a/examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/a.out b/examples/sycl/CMakeFiles/3.22.1/CompilerIdCXX/a.out
deleted file mode 100755
index 5bbad36aa9e184b4c3bc102bcdf2b1644295c98f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 16264
zcmeHOYit}>6~1FT#BI~86Jp}#(M*D=Y#dK^H%?+&l8oa=#+B{7oMLd2$#{2cud)wo
zceb@d1&-39gpeu-2?Ys>2!s&)C_*Yye}%MxgogyqABgw?Zj?$XkJgBS3<_D!x%ZsS
zc(UdZA@OIfHFNIy?)mPybLQ^&&YgQd+uhewrD=jwmDnXvGwVzVsk0DPyHp8l7mo{5
ztQYIV2LP#o!;~c`RgUG&DpkO;mh7rQBCi26Qko0oI;jcppc1zbCGzekld?))T%uH#
zqE6|^lSQNpx;~cQBnK@0Jn2tY^$6Wm8IrFPkENf7v6*jG=JPuhkL7WS%Q#8qp^Cg(
zL-A^NDIUui^2d_niEYHAo#NSEt#~YTD8r=7Gt^I#c|*h-A|6Y&XDMR=+(U1B$%pZ`
zpLjJJRRd}^nli(3vw{Un_IChy7?=Mv@ohdx{ua9rwG@Ar8addKPK|7D+mcSUrc>GC
zWb0)6_SWrfMj>a!r3JPNePEy3J8)Q3O^YjtF`vs($8t;<V!qudk&ET(x=ulk`Y6a7
z?$0w4SmcB?uB=doZPF{?=%*5najt~-0KN&12a;`%WpY`&;5m8EwuRk0*l#D@ygQaE
zcy4~MzcZc7x`WP0+ErzhOYFp?V~?h?PC9iG3~h*Zg6hmTsjL9K;3X3+Eg13bsAkH!
zu>_c+7DGIt@V$M#dphm7v9(QbHg0Ug)>lJ}@lQn_<tu=4|9I_-Uxx5ki;IL6*Q6f7
zyYvmDLb|!zloIa|JzAtb>%%d&obLE=zSmeb=fmlzCMokioclW070NIIVFbbmgb@fM
z5Jn)3Kp25A0{`m>{5kUE->uVssI|^5d20*2=D&Q!t18V{r+-*`Rk|u|y9IFRBe&rk
z*<eEM>yR70!0kYO?2o-?ZydAE-m*^Lp4&gz9lH|yiFN&YbOo5#e;Ul+A2lKyUXm2+
z{7&iT{E}Zof%UO@uO3_<!#uctq%<4ZFpY*+NrL_Nu<-o0@1SVDXPuq1-gtMn^~QX)
zrCqab-1AmJ1+7#;ZE1EivZ3ofAKN#*GXuq<*mBr9z4I2nSX*b`_Lf=ac0LLCo#s-h
zbSDX?YfHL8*N%ekvVY{i^8yr7dm^tMX}o)?uko%q4FP#|u<<T<|1H#MC4LdPTmx~e
ze+#t8<yDVrZ$>Vkx}clFI=Agd5^GzoUDJLE=!`_`ua_;Mk~gJNQ>)Erb5}+AzLf6-
zG&@XT1i}b}5eOp?Mj(tp7=bVXVFbbmgb@fM5Juo3BY@w-dIx&;J)Ov9CQ@lPKir%3
z+;m^xbNxqEK_QhXrX9~6hR>J;O@Ka{%V&oB@Y^VXPChX{%pA|jCY^i|z6_2!#k4mZ
z+d(Dnq~~S}sa&>DF55S7^nT+Pw8ehj&7<vb6jD4IZ!<81shsgqWhIi&O<uqUR+%4X
zehm5A+UjSTK;rkeb?=o*M?h`{i8B@-2MN;&V)I<7^i`0n;J~DvJSeouI&JN;rL`B}
z10dqKR$=blQt2r)=vpd2F*ZWB4UX3Vm%#G6p5+bCMjm^ic3SLS^UO!LHf}<5<fGq9
zV8`F)5l4;(#~EnH7y@AmBM?R)j6fKHFalu&!U%*B2qO?iAdJ9&837$W&SILSb<n(q
zk>4>`e_mRtb*h-14(P9tzKkTVr+%FDi{$4C(s}*!TGIJlXASdrnKFm(Qkb~>FZW70
z)F-P=6y>#;(Hf=W`b<ohsQgi-itmzqjAR4JdXg;4Ycfq!LQAPYU!rtw$0XsfNK_`S
z?_No@Z;sjnUyNmYIgar9DD_z?C;I<g@VfHF8lqD?YT%1Cru8JFB)dr-AbEo1-p<aa
z^`^rk#jID<V=)6>y<1~Nsm4ANhxhNcW`c#BaA-Q%ti!_Lj{UuQ*Z$6ymMw7Toirz*
zt9O4RsyA`YfXvodG`6EzHfzxn#=l1VsT4H8`=`OW-{3m;4=!T=B;ZxB>tTZT4N!$T
z;g<sVR>JxBMdqRxGMZ=*L#8Pq?jN|GTNB`&`oAoM(~EGRU;e!^eg-m{SSk4T#Q3)$
zQ|WpyOZ$kx^$3{a@+`y*NKHfpu1LVd_+oMRJ=kbsg?MeFiRKGZCBJ`@7IlKx_cLJ`
zT(}0a9>M-sFYylvLGfYx7QpMo>T=u|f1*kV9q@*LeH`ppz;*F{jT$0!0KSRIK6wQ2
zl@MoKw1SDBgAUeHwHm5XxGaQIR@T2-v=h$HhZhMq1Ni5#{;K7IpNCw}7Z7K~C%;Yg
z><`rQQ^K19`1&f<ev|s)Q&iVY=@*`cgtLp^U_EeM+z%Q59n~KV)IU$S6~I@6U%W5D
z_%}+tK^!N0uD_Y^px<qP>ultc2NuDPFM^9hiM$sxa<Hes8A;imGbR8ic*W6CBO&f@
zRIt5_oxrUM1+Yx!?6Gui#7WypFPAUaPH|G;d}!MB+@uk$Y=PSsQnr)NJ5#os_3~3<
zH1A|wJ6X(RrhwwpY_Rken0EF%C*01VA%4R3CXK|zgs?x=7mKH|qdA_rL?j-QNN`dU
zT=X35=<l|>2fA=0h7I$xcDF^K)pbzVU7sB2=<n?m_TIjIdpi2;eLX#gx(DsSjy-+d
zz;Kg}=fEzFeX#Q)nabM5f}4bT{97|j|7HrgGXtVk$l2pCzY9A&WW#tiSZO#7EM#n;
zs=&66a&53nLsHXDcC6@(x%Pw8!peYWrG*S3)Ce0)=z<pnB~TsgPFXPj3`D$EwZjB1
zJd4*kQYbL+uQst%*zZztPm3@LQyI@00qNzH9OqmL=G5~O!pP=4*BHwdjfs42!p(bA
z1a<7`ZN;uC%f_9;xG<7aS*TA*FRw~yI(&g*Ls{NUJ4g@)b~Is3q&;IS2N@6IDh%wX
zV3d=$+i=IJYmO&@%9^68KB^d;hkBh%DgjP&9@Y#^!fs<g_=ExXCj*_|_k(@NERF}^
z5d;70RBEU;Wrp_$@_3Mnr7h=Cei#lMvoXJ&_`DxZhcbWS=QU&7A>-!{5uf+P>1;@n
z|DQ121)0CX7?1r=5`U5gmW^b{dx(Pm2cR6ssLa1a{7b~YKztnMYH%>0$DhvuhNjHt
zadDIy%Hv=#kqPVYr(>Ytn3efFPMsw_kMFttY{&8h@No>xI*)_rh#w_q!S-_-PC^Dv
zna|^HogDJGoBao`|0Uq!HDUcS@&7ygfC7o%H<YG)aUSJuIQ)E__&V`PlqsyU{Ct2P
zC4Q6)xV>Bf>ny(%;Pd}+llY$q+%Wdb^?fD4=W)ix70QC&$Jm~33#;O5#OL_)xJl^B
zr0U=J$8jf`FrUYJyw4N&8TwPu|M#KL&yQ|2rFnFtVx3W<eCD_@@B_#!hl74D<#93Z
z`wZH%&hlr#h0$V}Uml0Us$5KLr?mSz<G+Fe?0f8=$F;nVm-p*(|70HDM|_XQ>t{ai
zJG{9CKuAqWl+Vm(ISVCzzA#N?8&!gz6H0rq{qI15pU>mD2I8AcB*}gmeh)H;ar=3E
zH$?oPJ^J#e_d!97%*#d0$LEVbG2bt_@A>V1%>Jpv;d21LwQ_qnF8DskZTItM+LU%O
YP&xZ$Tl|?-;y=4XS^R?yNe21<1ipxBpa1{>

diff --git a/examples/sycl/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/CMakeFiles/Makefile.cmake b/examples/sycl/CMakeFiles/Makefile.cmake
deleted file mode 100644
index b7ba0b4b841a3..0000000000000
--- a/examples/sycl/CMakeFiles/Makefile.cmake
+++ /dev/null
@@ -1,130 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# The generator used is:
-set(CMAKE_DEPENDS_GENERATOR "Unix Makefiles")
-
-# The top level Makefile was generated from the following files:
-set(CMAKE_MAKEFILE_DEPENDS
-  "CMakeCache.txt"
-  "../CMakeLists.txt"
-  "../baby-llama/CMakeLists.txt"
-  "../batched-bench/CMakeLists.txt"
-  "../batched/CMakeLists.txt"
-  "../beam-search/CMakeLists.txt"
-  "../benchmark/CMakeLists.txt"
-  "../convert-llama2c-to-ggml/CMakeLists.txt"
-  "../embedding/CMakeLists.txt"
-  "../export-lora/CMakeLists.txt"
-  "../finetune/CMakeLists.txt"
-  "../infill/CMakeLists.txt"
-  "../llama-bench/CMakeLists.txt"
-  "../llava/CMakeLists.txt"
-  "../lookahead/CMakeLists.txt"
-  "../lookup/CMakeLists.txt"
-  "../main/CMakeLists.txt"
-  "../parallel/CMakeLists.txt"
-  "../perplexity/CMakeLists.txt"
-  "../quantize-stats/CMakeLists.txt"
-  "../quantize/CMakeLists.txt"
-  "../save-load-state/CMakeLists.txt"
-  "../simple/CMakeLists.txt"
-  "../speculative/CMakeLists.txt"
-  "CMakeFiles/3.22.1/CMakeCCompiler.cmake"
-  "CMakeFiles/3.22.1/CMakeCXXCompiler.cmake"
-  "CMakeFiles/3.22.1/CMakeSystem.cmake"
-  "CMakeLists.txt"
-  "../tokenize/CMakeLists.txt"
-  "../train-text-from-scratch/CMakeLists.txt"
-  "/usr/share/cmake-3.22/Modules/CMakeCInformation.cmake"
-  "/usr/share/cmake-3.22/Modules/CMakeCXXInformation.cmake"
-  "/usr/share/cmake-3.22/Modules/CMakeCommonLanguageInclude.cmake"
-  "/usr/share/cmake-3.22/Modules/CMakeGenericSystem.cmake"
-  "/usr/share/cmake-3.22/Modules/CMakeInitializeConfigs.cmake"
-  "/usr/share/cmake-3.22/Modules/CMakeLanguageInformation.cmake"
-  "/usr/share/cmake-3.22/Modules/CMakeSystemSpecificInformation.cmake"
-  "/usr/share/cmake-3.22/Modules/CMakeSystemSpecificInitialize.cmake"
-  "/usr/share/cmake-3.22/Modules/CheckCSourceCompiles.cmake"
-  "/usr/share/cmake-3.22/Modules/CheckIncludeFile.cmake"
-  "/usr/share/cmake-3.22/Modules/CheckLibraryExists.cmake"
-  "/usr/share/cmake-3.22/Modules/Compiler/CMakeCommonCompilerMacros.cmake"
-  "/usr/share/cmake-3.22/Modules/Compiler/IntelLLVM-C.cmake"
-  "/usr/share/cmake-3.22/Modules/Compiler/IntelLLVM-CXX.cmake"
-  "/usr/share/cmake-3.22/Modules/Compiler/IntelLLVM.cmake"
-  "/usr/share/cmake-3.22/Modules/FindPackageHandleStandardArgs.cmake"
-  "/usr/share/cmake-3.22/Modules/FindPackageMessage.cmake"
-  "/usr/share/cmake-3.22/Modules/FindThreads.cmake"
-  "/usr/share/cmake-3.22/Modules/Internal/CheckSourceCompiles.cmake"
-  "/usr/share/cmake-3.22/Modules/Platform/Linux-IntelLLVM-C.cmake"
-  "/usr/share/cmake-3.22/Modules/Platform/Linux-IntelLLVM-CXX.cmake"
-  "/usr/share/cmake-3.22/Modules/Platform/Linux-IntelLLVM.cmake"
-  "/usr/share/cmake-3.22/Modules/Platform/Linux.cmake"
-  "/usr/share/cmake-3.22/Modules/Platform/UnixPaths.cmake"
-  )
-
-# The corresponding makefile is:
-set(CMAKE_MAKEFILE_OUTPUTS
-  "Makefile"
-  "CMakeFiles/cmake.check_cache"
-  )
-
-# Byproducts of CMake generate step:
-set(CMAKE_MAKEFILE_PRODUCTS
-  "CMakeFiles/CMakeDirectoryInformation.cmake"
-  "baby-llama/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "batched/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "batched-bench/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "beam-search/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "benchmark/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "convert-llama2c-to-ggml/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "embedding/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "finetune/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "infill/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "llama-bench/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "llava/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "sycl/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "main/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "tokenize/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "parallel/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "perplexity/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "quantize/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "quantize-stats/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "save-load-state/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "simple/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "speculative/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "lookahead/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "lookup/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "train-text-from-scratch/CMakeFiles/CMakeDirectoryInformation.cmake"
-  "export-lora/CMakeFiles/CMakeDirectoryInformation.cmake"
-  )
-
-# Dependency information for all targets:
-set(CMAKE_DEPEND_INFO_FILES
-  "baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake"
-  "batched/CMakeFiles/batched.dir/DependInfo.cmake"
-  "batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake"
-  "beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake"
-  "benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake"
-  "convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake"
-  "embedding/CMakeFiles/embedding.dir/DependInfo.cmake"
-  "finetune/CMakeFiles/finetune.dir/DependInfo.cmake"
-  "infill/CMakeFiles/infill.dir/DependInfo.cmake"
-  "llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake"
-  "llava/CMakeFiles/llava.dir/DependInfo.cmake"
-  "llava/CMakeFiles/llava_static.dir/DependInfo.cmake"
-  "llava/CMakeFiles/llava-cli.dir/DependInfo.cmake"
-  "sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake"
-  "main/CMakeFiles/main.dir/DependInfo.cmake"
-  "tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake"
-  "parallel/CMakeFiles/parallel.dir/DependInfo.cmake"
-  "perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake"
-  "quantize/CMakeFiles/quantize.dir/DependInfo.cmake"
-  "quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake"
-  "save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake"
-  "simple/CMakeFiles/simple.dir/DependInfo.cmake"
-  "speculative/CMakeFiles/speculative.dir/DependInfo.cmake"
-  "lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake"
-  "lookup/CMakeFiles/lookup.dir/DependInfo.cmake"
-  "train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake"
-  "export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake"
-  )
diff --git a/examples/sycl/CMakeFiles/Makefile2 b/examples/sycl/CMakeFiles/Makefile2
deleted file mode 100644
index 27a69e727b838..0000000000000
--- a/examples/sycl/CMakeFiles/Makefile2
+++ /dev/null
@@ -1,1239 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Directory level rules for the build root directory
-
-# The main recursive "all" target.
-all: baby-llama/all
-all: batched/all
-all: batched-bench/all
-all: beam-search/all
-all: benchmark/all
-all: convert-llama2c-to-ggml/all
-all: embedding/all
-all: finetune/all
-all: infill/all
-all: llama-bench/all
-all: llava/all
-all: sycl/all
-all: main/all
-all: tokenize/all
-all: parallel/all
-all: perplexity/all
-all: quantize/all
-all: quantize-stats/all
-all: save-load-state/all
-all: simple/all
-all: speculative/all
-all: lookahead/all
-all: lookup/all
-all: train-text-from-scratch/all
-all: export-lora/all
-.PHONY : all
-
-# The main recursive "preinstall" target.
-preinstall: baby-llama/preinstall
-preinstall: batched/preinstall
-preinstall: batched-bench/preinstall
-preinstall: beam-search/preinstall
-preinstall: benchmark/preinstall
-preinstall: convert-llama2c-to-ggml/preinstall
-preinstall: embedding/preinstall
-preinstall: finetune/preinstall
-preinstall: infill/preinstall
-preinstall: llama-bench/preinstall
-preinstall: llava/preinstall
-preinstall: sycl/preinstall
-preinstall: main/preinstall
-preinstall: tokenize/preinstall
-preinstall: parallel/preinstall
-preinstall: perplexity/preinstall
-preinstall: quantize/preinstall
-preinstall: quantize-stats/preinstall
-preinstall: save-load-state/preinstall
-preinstall: simple/preinstall
-preinstall: speculative/preinstall
-preinstall: lookahead/preinstall
-preinstall: lookup/preinstall
-preinstall: train-text-from-scratch/preinstall
-preinstall: export-lora/preinstall
-.PHONY : preinstall
-
-# The main recursive "clean" target.
-clean: baby-llama/clean
-clean: batched/clean
-clean: batched-bench/clean
-clean: beam-search/clean
-clean: benchmark/clean
-clean: convert-llama2c-to-ggml/clean
-clean: embedding/clean
-clean: finetune/clean
-clean: infill/clean
-clean: llama-bench/clean
-clean: llava/clean
-clean: sycl/clean
-clean: main/clean
-clean: tokenize/clean
-clean: parallel/clean
-clean: perplexity/clean
-clean: quantize/clean
-clean: quantize-stats/clean
-clean: save-load-state/clean
-clean: simple/clean
-clean: speculative/clean
-clean: lookahead/clean
-clean: lookup/clean
-clean: train-text-from-scratch/clean
-clean: export-lora/clean
-.PHONY : clean
-
-#=============================================================================
-# Directory level rules for directory baby-llama
-
-# Recursive "all" directory target.
-baby-llama/all: baby-llama/CMakeFiles/baby-llama.dir/all
-.PHONY : baby-llama/all
-
-# Recursive "preinstall" directory target.
-baby-llama/preinstall:
-.PHONY : baby-llama/preinstall
-
-# Recursive "clean" directory target.
-baby-llama/clean: baby-llama/CMakeFiles/baby-llama.dir/clean
-.PHONY : baby-llama/clean
-
-#=============================================================================
-# Directory level rules for directory batched
-
-# Recursive "all" directory target.
-batched/all: batched/CMakeFiles/batched.dir/all
-.PHONY : batched/all
-
-# Recursive "preinstall" directory target.
-batched/preinstall:
-.PHONY : batched/preinstall
-
-# Recursive "clean" directory target.
-batched/clean: batched/CMakeFiles/batched.dir/clean
-.PHONY : batched/clean
-
-#=============================================================================
-# Directory level rules for directory batched-bench
-
-# Recursive "all" directory target.
-batched-bench/all: batched-bench/CMakeFiles/batched-bench.dir/all
-.PHONY : batched-bench/all
-
-# Recursive "preinstall" directory target.
-batched-bench/preinstall:
-.PHONY : batched-bench/preinstall
-
-# Recursive "clean" directory target.
-batched-bench/clean: batched-bench/CMakeFiles/batched-bench.dir/clean
-.PHONY : batched-bench/clean
-
-#=============================================================================
-# Directory level rules for directory beam-search
-
-# Recursive "all" directory target.
-beam-search/all: beam-search/CMakeFiles/beam-search.dir/all
-.PHONY : beam-search/all
-
-# Recursive "preinstall" directory target.
-beam-search/preinstall:
-.PHONY : beam-search/preinstall
-
-# Recursive "clean" directory target.
-beam-search/clean: beam-search/CMakeFiles/beam-search.dir/clean
-.PHONY : beam-search/clean
-
-#=============================================================================
-# Directory level rules for directory benchmark
-
-# Recursive "all" directory target.
-benchmark/all: benchmark/CMakeFiles/benchmark.dir/all
-.PHONY : benchmark/all
-
-# Recursive "preinstall" directory target.
-benchmark/preinstall:
-.PHONY : benchmark/preinstall
-
-# Recursive "clean" directory target.
-benchmark/clean: benchmark/CMakeFiles/benchmark.dir/clean
-.PHONY : benchmark/clean
-
-#=============================================================================
-# Directory level rules for directory convert-llama2c-to-ggml
-
-# Recursive "all" directory target.
-convert-llama2c-to-ggml/all: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/all
-.PHONY : convert-llama2c-to-ggml/all
-
-# Recursive "preinstall" directory target.
-convert-llama2c-to-ggml/preinstall:
-.PHONY : convert-llama2c-to-ggml/preinstall
-
-# Recursive "clean" directory target.
-convert-llama2c-to-ggml/clean: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean
-.PHONY : convert-llama2c-to-ggml/clean
-
-#=============================================================================
-# Directory level rules for directory embedding
-
-# Recursive "all" directory target.
-embedding/all: embedding/CMakeFiles/embedding.dir/all
-.PHONY : embedding/all
-
-# Recursive "preinstall" directory target.
-embedding/preinstall:
-.PHONY : embedding/preinstall
-
-# Recursive "clean" directory target.
-embedding/clean: embedding/CMakeFiles/embedding.dir/clean
-.PHONY : embedding/clean
-
-#=============================================================================
-# Directory level rules for directory export-lora
-
-# Recursive "all" directory target.
-export-lora/all: export-lora/CMakeFiles/export-lora.dir/all
-.PHONY : export-lora/all
-
-# Recursive "preinstall" directory target.
-export-lora/preinstall:
-.PHONY : export-lora/preinstall
-
-# Recursive "clean" directory target.
-export-lora/clean: export-lora/CMakeFiles/export-lora.dir/clean
-.PHONY : export-lora/clean
-
-#=============================================================================
-# Directory level rules for directory finetune
-
-# Recursive "all" directory target.
-finetune/all: finetune/CMakeFiles/finetune.dir/all
-.PHONY : finetune/all
-
-# Recursive "preinstall" directory target.
-finetune/preinstall:
-.PHONY : finetune/preinstall
-
-# Recursive "clean" directory target.
-finetune/clean: finetune/CMakeFiles/finetune.dir/clean
-.PHONY : finetune/clean
-
-#=============================================================================
-# Directory level rules for directory infill
-
-# Recursive "all" directory target.
-infill/all: infill/CMakeFiles/infill.dir/all
-.PHONY : infill/all
-
-# Recursive "preinstall" directory target.
-infill/preinstall:
-.PHONY : infill/preinstall
-
-# Recursive "clean" directory target.
-infill/clean: infill/CMakeFiles/infill.dir/clean
-.PHONY : infill/clean
-
-#=============================================================================
-# Directory level rules for directory llama-bench
-
-# Recursive "all" directory target.
-llama-bench/all: llama-bench/CMakeFiles/llama-bench.dir/all
-.PHONY : llama-bench/all
-
-# Recursive "preinstall" directory target.
-llama-bench/preinstall:
-.PHONY : llama-bench/preinstall
-
-# Recursive "clean" directory target.
-llama-bench/clean: llama-bench/CMakeFiles/llama-bench.dir/clean
-.PHONY : llama-bench/clean
-
-#=============================================================================
-# Directory level rules for directory llava
-
-# Recursive "all" directory target.
-llava/all: llava/CMakeFiles/llava.dir/all
-llava/all: llava/CMakeFiles/llava_static.dir/all
-llava/all: llava/CMakeFiles/llava-cli.dir/all
-.PHONY : llava/all
-
-# Recursive "preinstall" directory target.
-llava/preinstall:
-.PHONY : llava/preinstall
-
-# Recursive "clean" directory target.
-llava/clean: llava/CMakeFiles/llava.dir/clean
-llava/clean: llava/CMakeFiles/llava_static.dir/clean
-llava/clean: llava/CMakeFiles/llava-cli.dir/clean
-.PHONY : llava/clean
-
-#=============================================================================
-# Directory level rules for directory lookahead
-
-# Recursive "all" directory target.
-lookahead/all: lookahead/CMakeFiles/lookahead.dir/all
-.PHONY : lookahead/all
-
-# Recursive "preinstall" directory target.
-lookahead/preinstall:
-.PHONY : lookahead/preinstall
-
-# Recursive "clean" directory target.
-lookahead/clean: lookahead/CMakeFiles/lookahead.dir/clean
-.PHONY : lookahead/clean
-
-#=============================================================================
-# Directory level rules for directory lookup
-
-# Recursive "all" directory target.
-lookup/all: lookup/CMakeFiles/lookup.dir/all
-.PHONY : lookup/all
-
-# Recursive "preinstall" directory target.
-lookup/preinstall:
-.PHONY : lookup/preinstall
-
-# Recursive "clean" directory target.
-lookup/clean: lookup/CMakeFiles/lookup.dir/clean
-.PHONY : lookup/clean
-
-#=============================================================================
-# Directory level rules for directory main
-
-# Recursive "all" directory target.
-main/all: main/CMakeFiles/main.dir/all
-.PHONY : main/all
-
-# Recursive "preinstall" directory target.
-main/preinstall:
-.PHONY : main/preinstall
-
-# Recursive "clean" directory target.
-main/clean: main/CMakeFiles/main.dir/clean
-.PHONY : main/clean
-
-#=============================================================================
-# Directory level rules for directory parallel
-
-# Recursive "all" directory target.
-parallel/all: parallel/CMakeFiles/parallel.dir/all
-.PHONY : parallel/all
-
-# Recursive "preinstall" directory target.
-parallel/preinstall:
-.PHONY : parallel/preinstall
-
-# Recursive "clean" directory target.
-parallel/clean: parallel/CMakeFiles/parallel.dir/clean
-.PHONY : parallel/clean
-
-#=============================================================================
-# Directory level rules for directory perplexity
-
-# Recursive "all" directory target.
-perplexity/all: perplexity/CMakeFiles/perplexity.dir/all
-.PHONY : perplexity/all
-
-# Recursive "preinstall" directory target.
-perplexity/preinstall:
-.PHONY : perplexity/preinstall
-
-# Recursive "clean" directory target.
-perplexity/clean: perplexity/CMakeFiles/perplexity.dir/clean
-.PHONY : perplexity/clean
-
-#=============================================================================
-# Directory level rules for directory quantize
-
-# Recursive "all" directory target.
-quantize/all: quantize/CMakeFiles/quantize.dir/all
-.PHONY : quantize/all
-
-# Recursive "preinstall" directory target.
-quantize/preinstall:
-.PHONY : quantize/preinstall
-
-# Recursive "clean" directory target.
-quantize/clean: quantize/CMakeFiles/quantize.dir/clean
-.PHONY : quantize/clean
-
-#=============================================================================
-# Directory level rules for directory quantize-stats
-
-# Recursive "all" directory target.
-quantize-stats/all: quantize-stats/CMakeFiles/quantize-stats.dir/all
-.PHONY : quantize-stats/all
-
-# Recursive "preinstall" directory target.
-quantize-stats/preinstall:
-.PHONY : quantize-stats/preinstall
-
-# Recursive "clean" directory target.
-quantize-stats/clean: quantize-stats/CMakeFiles/quantize-stats.dir/clean
-.PHONY : quantize-stats/clean
-
-#=============================================================================
-# Directory level rules for directory save-load-state
-
-# Recursive "all" directory target.
-save-load-state/all: save-load-state/CMakeFiles/save-load-state.dir/all
-.PHONY : save-load-state/all
-
-# Recursive "preinstall" directory target.
-save-load-state/preinstall:
-.PHONY : save-load-state/preinstall
-
-# Recursive "clean" directory target.
-save-load-state/clean: save-load-state/CMakeFiles/save-load-state.dir/clean
-.PHONY : save-load-state/clean
-
-#=============================================================================
-# Directory level rules for directory simple
-
-# Recursive "all" directory target.
-simple/all: simple/CMakeFiles/simple.dir/all
-.PHONY : simple/all
-
-# Recursive "preinstall" directory target.
-simple/preinstall:
-.PHONY : simple/preinstall
-
-# Recursive "clean" directory target.
-simple/clean: simple/CMakeFiles/simple.dir/clean
-.PHONY : simple/clean
-
-#=============================================================================
-# Directory level rules for directory speculative
-
-# Recursive "all" directory target.
-speculative/all: speculative/CMakeFiles/speculative.dir/all
-.PHONY : speculative/all
-
-# Recursive "preinstall" directory target.
-speculative/preinstall:
-.PHONY : speculative/preinstall
-
-# Recursive "clean" directory target.
-speculative/clean: speculative/CMakeFiles/speculative.dir/clean
-.PHONY : speculative/clean
-
-#=============================================================================
-# Directory level rules for directory sycl
-
-# Recursive "all" directory target.
-sycl/all: sycl/CMakeFiles/ls-sycl-device.dir/all
-.PHONY : sycl/all
-
-# Recursive "preinstall" directory target.
-sycl/preinstall:
-.PHONY : sycl/preinstall
-
-# Recursive "clean" directory target.
-sycl/clean: sycl/CMakeFiles/ls-sycl-device.dir/clean
-.PHONY : sycl/clean
-
-#=============================================================================
-# Directory level rules for directory tokenize
-
-# Recursive "all" directory target.
-tokenize/all: tokenize/CMakeFiles/tokenize.dir/all
-.PHONY : tokenize/all
-
-# Recursive "preinstall" directory target.
-tokenize/preinstall:
-.PHONY : tokenize/preinstall
-
-# Recursive "clean" directory target.
-tokenize/clean: tokenize/CMakeFiles/tokenize.dir/clean
-.PHONY : tokenize/clean
-
-#=============================================================================
-# Directory level rules for directory train-text-from-scratch
-
-# Recursive "all" directory target.
-train-text-from-scratch/all: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/all
-.PHONY : train-text-from-scratch/all
-
-# Recursive "preinstall" directory target.
-train-text-from-scratch/preinstall:
-.PHONY : train-text-from-scratch/preinstall
-
-# Recursive "clean" directory target.
-train-text-from-scratch/clean: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean
-.PHONY : train-text-from-scratch/clean
-
-#=============================================================================
-# Target rules for target baby-llama/CMakeFiles/baby-llama.dir
-
-# All Build rule for target.
-baby-llama/CMakeFiles/baby-llama.dir/all:
-	$(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/depend
-	$(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=1,2 "Built target baby-llama"
-.PHONY : baby-llama/CMakeFiles/baby-llama.dir/all
-
-# Build rule for subdir invocation for target.
-baby-llama/CMakeFiles/baby-llama.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/CMakeFiles/baby-llama.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : baby-llama/CMakeFiles/baby-llama.dir/rule
-
-# Convenience name for target.
-baby-llama: baby-llama/CMakeFiles/baby-llama.dir/rule
-.PHONY : baby-llama
-
-# clean rule for target.
-baby-llama/CMakeFiles/baby-llama.dir/clean:
-	$(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/clean
-.PHONY : baby-llama/CMakeFiles/baby-llama.dir/clean
-
-#=============================================================================
-# Target rules for target batched/CMakeFiles/batched.dir
-
-# All Build rule for target.
-batched/CMakeFiles/batched.dir/all:
-	$(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/depend
-	$(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=3,4 "Built target batched"
-.PHONY : batched/CMakeFiles/batched.dir/all
-
-# Build rule for subdir invocation for target.
-batched/CMakeFiles/batched.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/CMakeFiles/batched.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : batched/CMakeFiles/batched.dir/rule
-
-# Convenience name for target.
-batched: batched/CMakeFiles/batched.dir/rule
-.PHONY : batched
-
-# clean rule for target.
-batched/CMakeFiles/batched.dir/clean:
-	$(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/clean
-.PHONY : batched/CMakeFiles/batched.dir/clean
-
-#=============================================================================
-# Target rules for target batched-bench/CMakeFiles/batched-bench.dir
-
-# All Build rule for target.
-batched-bench/CMakeFiles/batched-bench.dir/all:
-	$(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/depend
-	$(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=5,6 "Built target batched-bench"
-.PHONY : batched-bench/CMakeFiles/batched-bench.dir/all
-
-# Build rule for subdir invocation for target.
-batched-bench/CMakeFiles/batched-bench.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/CMakeFiles/batched-bench.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : batched-bench/CMakeFiles/batched-bench.dir/rule
-
-# Convenience name for target.
-batched-bench: batched-bench/CMakeFiles/batched-bench.dir/rule
-.PHONY : batched-bench
-
-# clean rule for target.
-batched-bench/CMakeFiles/batched-bench.dir/clean:
-	$(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/clean
-.PHONY : batched-bench/CMakeFiles/batched-bench.dir/clean
-
-#=============================================================================
-# Target rules for target beam-search/CMakeFiles/beam-search.dir
-
-# All Build rule for target.
-beam-search/CMakeFiles/beam-search.dir/all:
-	$(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/depend
-	$(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=7,8 "Built target beam-search"
-.PHONY : beam-search/CMakeFiles/beam-search.dir/all
-
-# Build rule for subdir invocation for target.
-beam-search/CMakeFiles/beam-search.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/CMakeFiles/beam-search.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : beam-search/CMakeFiles/beam-search.dir/rule
-
-# Convenience name for target.
-beam-search: beam-search/CMakeFiles/beam-search.dir/rule
-.PHONY : beam-search
-
-# clean rule for target.
-beam-search/CMakeFiles/beam-search.dir/clean:
-	$(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/clean
-.PHONY : beam-search/CMakeFiles/beam-search.dir/clean
-
-#=============================================================================
-# Target rules for target benchmark/CMakeFiles/benchmark.dir
-
-# All Build rule for target.
-benchmark/CMakeFiles/benchmark.dir/all:
-	$(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/depend
-	$(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=9,10 "Built target benchmark"
-.PHONY : benchmark/CMakeFiles/benchmark.dir/all
-
-# Build rule for subdir invocation for target.
-benchmark/CMakeFiles/benchmark.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/CMakeFiles/benchmark.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : benchmark/CMakeFiles/benchmark.dir/rule
-
-# Convenience name for target.
-benchmark: benchmark/CMakeFiles/benchmark.dir/rule
-.PHONY : benchmark
-
-# clean rule for target.
-benchmark/CMakeFiles/benchmark.dir/clean:
-	$(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/clean
-.PHONY : benchmark/CMakeFiles/benchmark.dir/clean
-
-#=============================================================================
-# Target rules for target convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir
-
-# All Build rule for target.
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/all:
-	$(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend
-	$(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=11,12 "Built target convert-llama2c-to-ggml"
-.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/all
-
-# Build rule for subdir invocation for target.
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule
-
-# Convenience name for target.
-convert-llama2c-to-ggml: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule
-.PHONY : convert-llama2c-to-ggml
-
-# clean rule for target.
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean:
-	$(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean
-.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean
-
-#=============================================================================
-# Target rules for target embedding/CMakeFiles/embedding.dir
-
-# All Build rule for target.
-embedding/CMakeFiles/embedding.dir/all:
-	$(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/depend
-	$(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=13,14 "Built target embedding"
-.PHONY : embedding/CMakeFiles/embedding.dir/all
-
-# Build rule for subdir invocation for target.
-embedding/CMakeFiles/embedding.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/CMakeFiles/embedding.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : embedding/CMakeFiles/embedding.dir/rule
-
-# Convenience name for target.
-embedding: embedding/CMakeFiles/embedding.dir/rule
-.PHONY : embedding
-
-# clean rule for target.
-embedding/CMakeFiles/embedding.dir/clean:
-	$(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/clean
-.PHONY : embedding/CMakeFiles/embedding.dir/clean
-
-#=============================================================================
-# Target rules for target finetune/CMakeFiles/finetune.dir
-
-# All Build rule for target.
-finetune/CMakeFiles/finetune.dir/all:
-	$(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/depend
-	$(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=17,18 "Built target finetune"
-.PHONY : finetune/CMakeFiles/finetune.dir/all
-
-# Build rule for subdir invocation for target.
-finetune/CMakeFiles/finetune.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/CMakeFiles/finetune.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : finetune/CMakeFiles/finetune.dir/rule
-
-# Convenience name for target.
-finetune: finetune/CMakeFiles/finetune.dir/rule
-.PHONY : finetune
-
-# clean rule for target.
-finetune/CMakeFiles/finetune.dir/clean:
-	$(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/clean
-.PHONY : finetune/CMakeFiles/finetune.dir/clean
-
-#=============================================================================
-# Target rules for target infill/CMakeFiles/infill.dir
-
-# All Build rule for target.
-infill/CMakeFiles/infill.dir/all:
-	$(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/depend
-	$(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=19,20 "Built target infill"
-.PHONY : infill/CMakeFiles/infill.dir/all
-
-# Build rule for subdir invocation for target.
-infill/CMakeFiles/infill.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/CMakeFiles/infill.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : infill/CMakeFiles/infill.dir/rule
-
-# Convenience name for target.
-infill: infill/CMakeFiles/infill.dir/rule
-.PHONY : infill
-
-# clean rule for target.
-infill/CMakeFiles/infill.dir/clean:
-	$(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/clean
-.PHONY : infill/CMakeFiles/infill.dir/clean
-
-#=============================================================================
-# Target rules for target llama-bench/CMakeFiles/llama-bench.dir
-
-# All Build rule for target.
-llama-bench/CMakeFiles/llama-bench.dir/all:
-	$(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/depend
-	$(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=21,22 "Built target llama-bench"
-.PHONY : llama-bench/CMakeFiles/llama-bench.dir/all
-
-# Build rule for subdir invocation for target.
-llama-bench/CMakeFiles/llama-bench.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/CMakeFiles/llama-bench.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : llama-bench/CMakeFiles/llama-bench.dir/rule
-
-# Convenience name for target.
-llama-bench: llama-bench/CMakeFiles/llama-bench.dir/rule
-.PHONY : llama-bench
-
-# clean rule for target.
-llama-bench/CMakeFiles/llama-bench.dir/clean:
-	$(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/clean
-.PHONY : llama-bench/CMakeFiles/llama-bench.dir/clean
-
-#=============================================================================
-# Target rules for target llava/CMakeFiles/llava.dir
-
-# All Build rule for target.
-llava/CMakeFiles/llava.dir/all:
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/depend
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=23,24 "Built target llava"
-.PHONY : llava/CMakeFiles/llava.dir/all
-
-# Build rule for subdir invocation for target.
-llava/CMakeFiles/llava.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : llava/CMakeFiles/llava.dir/rule
-
-# Convenience name for target.
-llava: llava/CMakeFiles/llava.dir/rule
-.PHONY : llava
-
-# clean rule for target.
-llava/CMakeFiles/llava.dir/clean:
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/clean
-.PHONY : llava/CMakeFiles/llava.dir/clean
-
-#=============================================================================
-# Target rules for target llava/CMakeFiles/llava_static.dir
-
-# All Build rule for target.
-llava/CMakeFiles/llava_static.dir/all: llava/CMakeFiles/llava.dir/all
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava_static.dir/build.make llava/CMakeFiles/llava_static.dir/depend
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava_static.dir/build.make llava/CMakeFiles/llava_static.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=27 "Built target llava_static"
-.PHONY : llava/CMakeFiles/llava_static.dir/all
-
-# Build rule for subdir invocation for target.
-llava/CMakeFiles/llava_static.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 3
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava_static.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : llava/CMakeFiles/llava_static.dir/rule
-
-# Convenience name for target.
-llava_static: llava/CMakeFiles/llava_static.dir/rule
-.PHONY : llava_static
-
-# clean rule for target.
-llava/CMakeFiles/llava_static.dir/clean:
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava_static.dir/build.make llava/CMakeFiles/llava_static.dir/clean
-.PHONY : llava/CMakeFiles/llava_static.dir/clean
-
-#=============================================================================
-# Target rules for target llava/CMakeFiles/llava-cli.dir
-
-# All Build rule for target.
-llava/CMakeFiles/llava-cli.dir/all: llava/CMakeFiles/llava.dir/all
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/depend
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=25,26 "Built target llava-cli"
-.PHONY : llava/CMakeFiles/llava-cli.dir/all
-
-# Build rule for subdir invocation for target.
-llava/CMakeFiles/llava-cli.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 4
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava-cli.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : llava/CMakeFiles/llava-cli.dir/rule
-
-# Convenience name for target.
-llava-cli: llava/CMakeFiles/llava-cli.dir/rule
-.PHONY : llava-cli
-
-# clean rule for target.
-llava/CMakeFiles/llava-cli.dir/clean:
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/clean
-.PHONY : llava/CMakeFiles/llava-cli.dir/clean
-
-#=============================================================================
-# Target rules for target sycl/CMakeFiles/ls-sycl-device.dir
-
-# All Build rule for target.
-sycl/CMakeFiles/ls-sycl-device.dir/all:
-	$(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/depend
-	$(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=32,33 "Built target ls-sycl-device"
-.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/all
-
-# Build rule for subdir invocation for target.
-sycl/CMakeFiles/ls-sycl-device.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/CMakeFiles/ls-sycl-device.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/rule
-
-# Convenience name for target.
-ls-sycl-device: sycl/CMakeFiles/ls-sycl-device.dir/rule
-.PHONY : ls-sycl-device
-
-# clean rule for target.
-sycl/CMakeFiles/ls-sycl-device.dir/clean:
-	$(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/clean
-.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/clean
-
-#=============================================================================
-# Target rules for target main/CMakeFiles/main.dir
-
-# All Build rule for target.
-main/CMakeFiles/main.dir/all:
-	$(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/depend
-	$(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=34,35 "Built target main"
-.PHONY : main/CMakeFiles/main.dir/all
-
-# Build rule for subdir invocation for target.
-main/CMakeFiles/main.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/CMakeFiles/main.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : main/CMakeFiles/main.dir/rule
-
-# Convenience name for target.
-main: main/CMakeFiles/main.dir/rule
-.PHONY : main
-
-# clean rule for target.
-main/CMakeFiles/main.dir/clean:
-	$(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/clean
-.PHONY : main/CMakeFiles/main.dir/clean
-
-#=============================================================================
-# Target rules for target tokenize/CMakeFiles/tokenize.dir
-
-# All Build rule for target.
-tokenize/CMakeFiles/tokenize.dir/all:
-	$(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/depend
-	$(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=50,51 "Built target tokenize"
-.PHONY : tokenize/CMakeFiles/tokenize.dir/all
-
-# Build rule for subdir invocation for target.
-tokenize/CMakeFiles/tokenize.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/CMakeFiles/tokenize.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : tokenize/CMakeFiles/tokenize.dir/rule
-
-# Convenience name for target.
-tokenize: tokenize/CMakeFiles/tokenize.dir/rule
-.PHONY : tokenize
-
-# clean rule for target.
-tokenize/CMakeFiles/tokenize.dir/clean:
-	$(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/clean
-.PHONY : tokenize/CMakeFiles/tokenize.dir/clean
-
-#=============================================================================
-# Target rules for target parallel/CMakeFiles/parallel.dir
-
-# All Build rule for target.
-parallel/CMakeFiles/parallel.dir/all:
-	$(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/depend
-	$(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=36,37 "Built target parallel"
-.PHONY : parallel/CMakeFiles/parallel.dir/all
-
-# Build rule for subdir invocation for target.
-parallel/CMakeFiles/parallel.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/CMakeFiles/parallel.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : parallel/CMakeFiles/parallel.dir/rule
-
-# Convenience name for target.
-parallel: parallel/CMakeFiles/parallel.dir/rule
-.PHONY : parallel
-
-# clean rule for target.
-parallel/CMakeFiles/parallel.dir/clean:
-	$(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/clean
-.PHONY : parallel/CMakeFiles/parallel.dir/clean
-
-#=============================================================================
-# Target rules for target perplexity/CMakeFiles/perplexity.dir
-
-# All Build rule for target.
-perplexity/CMakeFiles/perplexity.dir/all:
-	$(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/depend
-	$(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=38,39 "Built target perplexity"
-.PHONY : perplexity/CMakeFiles/perplexity.dir/all
-
-# Build rule for subdir invocation for target.
-perplexity/CMakeFiles/perplexity.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/CMakeFiles/perplexity.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : perplexity/CMakeFiles/perplexity.dir/rule
-
-# Convenience name for target.
-perplexity: perplexity/CMakeFiles/perplexity.dir/rule
-.PHONY : perplexity
-
-# clean rule for target.
-perplexity/CMakeFiles/perplexity.dir/clean:
-	$(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/clean
-.PHONY : perplexity/CMakeFiles/perplexity.dir/clean
-
-#=============================================================================
-# Target rules for target quantize/CMakeFiles/quantize.dir
-
-# All Build rule for target.
-quantize/CMakeFiles/quantize.dir/all:
-	$(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/depend
-	$(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=40,41 "Built target quantize"
-.PHONY : quantize/CMakeFiles/quantize.dir/all
-
-# Build rule for subdir invocation for target.
-quantize/CMakeFiles/quantize.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/CMakeFiles/quantize.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : quantize/CMakeFiles/quantize.dir/rule
-
-# Convenience name for target.
-quantize: quantize/CMakeFiles/quantize.dir/rule
-.PHONY : quantize
-
-# clean rule for target.
-quantize/CMakeFiles/quantize.dir/clean:
-	$(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/clean
-.PHONY : quantize/CMakeFiles/quantize.dir/clean
-
-#=============================================================================
-# Target rules for target quantize-stats/CMakeFiles/quantize-stats.dir
-
-# All Build rule for target.
-quantize-stats/CMakeFiles/quantize-stats.dir/all:
-	$(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/depend
-	$(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=42,43 "Built target quantize-stats"
-.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/all
-
-# Build rule for subdir invocation for target.
-quantize-stats/CMakeFiles/quantize-stats.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/CMakeFiles/quantize-stats.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/rule
-
-# Convenience name for target.
-quantize-stats: quantize-stats/CMakeFiles/quantize-stats.dir/rule
-.PHONY : quantize-stats
-
-# clean rule for target.
-quantize-stats/CMakeFiles/quantize-stats.dir/clean:
-	$(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/clean
-.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/clean
-
-#=============================================================================
-# Target rules for target save-load-state/CMakeFiles/save-load-state.dir
-
-# All Build rule for target.
-save-load-state/CMakeFiles/save-load-state.dir/all:
-	$(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/depend
-	$(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=44,45 "Built target save-load-state"
-.PHONY : save-load-state/CMakeFiles/save-load-state.dir/all
-
-# Build rule for subdir invocation for target.
-save-load-state/CMakeFiles/save-load-state.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/CMakeFiles/save-load-state.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : save-load-state/CMakeFiles/save-load-state.dir/rule
-
-# Convenience name for target.
-save-load-state: save-load-state/CMakeFiles/save-load-state.dir/rule
-.PHONY : save-load-state
-
-# clean rule for target.
-save-load-state/CMakeFiles/save-load-state.dir/clean:
-	$(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/clean
-.PHONY : save-load-state/CMakeFiles/save-load-state.dir/clean
-
-#=============================================================================
-# Target rules for target simple/CMakeFiles/simple.dir
-
-# All Build rule for target.
-simple/CMakeFiles/simple.dir/all:
-	$(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/depend
-	$(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=46,47 "Built target simple"
-.PHONY : simple/CMakeFiles/simple.dir/all
-
-# Build rule for subdir invocation for target.
-simple/CMakeFiles/simple.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/CMakeFiles/simple.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : simple/CMakeFiles/simple.dir/rule
-
-# Convenience name for target.
-simple: simple/CMakeFiles/simple.dir/rule
-.PHONY : simple
-
-# clean rule for target.
-simple/CMakeFiles/simple.dir/clean:
-	$(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/clean
-.PHONY : simple/CMakeFiles/simple.dir/clean
-
-#=============================================================================
-# Target rules for target speculative/CMakeFiles/speculative.dir
-
-# All Build rule for target.
-speculative/CMakeFiles/speculative.dir/all:
-	$(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/depend
-	$(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=48,49 "Built target speculative"
-.PHONY : speculative/CMakeFiles/speculative.dir/all
-
-# Build rule for subdir invocation for target.
-speculative/CMakeFiles/speculative.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/CMakeFiles/speculative.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : speculative/CMakeFiles/speculative.dir/rule
-
-# Convenience name for target.
-speculative: speculative/CMakeFiles/speculative.dir/rule
-.PHONY : speculative
-
-# clean rule for target.
-speculative/CMakeFiles/speculative.dir/clean:
-	$(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/clean
-.PHONY : speculative/CMakeFiles/speculative.dir/clean
-
-#=============================================================================
-# Target rules for target lookahead/CMakeFiles/lookahead.dir
-
-# All Build rule for target.
-lookahead/CMakeFiles/lookahead.dir/all:
-	$(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/depend
-	$(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=28,29 "Built target lookahead"
-.PHONY : lookahead/CMakeFiles/lookahead.dir/all
-
-# Build rule for subdir invocation for target.
-lookahead/CMakeFiles/lookahead.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/CMakeFiles/lookahead.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : lookahead/CMakeFiles/lookahead.dir/rule
-
-# Convenience name for target.
-lookahead: lookahead/CMakeFiles/lookahead.dir/rule
-.PHONY : lookahead
-
-# clean rule for target.
-lookahead/CMakeFiles/lookahead.dir/clean:
-	$(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/clean
-.PHONY : lookahead/CMakeFiles/lookahead.dir/clean
-
-#=============================================================================
-# Target rules for target lookup/CMakeFiles/lookup.dir
-
-# All Build rule for target.
-lookup/CMakeFiles/lookup.dir/all:
-	$(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/depend
-	$(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=30,31 "Built target lookup"
-.PHONY : lookup/CMakeFiles/lookup.dir/all
-
-# Build rule for subdir invocation for target.
-lookup/CMakeFiles/lookup.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/CMakeFiles/lookup.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : lookup/CMakeFiles/lookup.dir/rule
-
-# Convenience name for target.
-lookup: lookup/CMakeFiles/lookup.dir/rule
-.PHONY : lookup
-
-# clean rule for target.
-lookup/CMakeFiles/lookup.dir/clean:
-	$(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/clean
-.PHONY : lookup/CMakeFiles/lookup.dir/clean
-
-#=============================================================================
-# Target rules for target train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir
-
-# All Build rule for target.
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/all:
-	$(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend
-	$(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=52,53 "Built target train-text-from-scratch"
-.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/all
-
-# Build rule for subdir invocation for target.
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule
-
-# Convenience name for target.
-train-text-from-scratch: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule
-.PHONY : train-text-from-scratch
-
-# clean rule for target.
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean:
-	$(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean
-.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean
-
-#=============================================================================
-# Target rules for target export-lora/CMakeFiles/export-lora.dir
-
-# All Build rule for target.
-export-lora/CMakeFiles/export-lora.dir/all:
-	$(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/depend
-	$(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/build
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=15,16 "Built target export-lora"
-.PHONY : export-lora/CMakeFiles/export-lora.dir/all
-
-# Build rule for subdir invocation for target.
-export-lora/CMakeFiles/export-lora.dir/rule: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 2
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/CMakeFiles/export-lora.dir/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : export-lora/CMakeFiles/export-lora.dir/rule
-
-# Convenience name for target.
-export-lora: export-lora/CMakeFiles/export-lora.dir/rule
-.PHONY : export-lora
-
-# clean rule for target.
-export-lora/CMakeFiles/export-lora.dir/clean:
-	$(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/clean
-.PHONY : export-lora/CMakeFiles/export-lora.dir/clean
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/CMakeFiles/Progress/1 b/examples/sycl/CMakeFiles/Progress/1
deleted file mode 100644
index 7b4d68d70fcae..0000000000000
--- a/examples/sycl/CMakeFiles/Progress/1
+++ /dev/null
@@ -1 +0,0 @@
-empty
\ No newline at end of file
diff --git a/examples/sycl/CMakeFiles/Progress/count.txt b/examples/sycl/CMakeFiles/Progress/count.txt
deleted file mode 100644
index 59343b09ec765..0000000000000
--- a/examples/sycl/CMakeFiles/Progress/count.txt
+++ /dev/null
@@ -1 +0,0 @@
-53
diff --git a/examples/sycl/CMakeFiles/TargetDirectories.txt b/examples/sycl/CMakeFiles/TargetDirectories.txt
deleted file mode 100644
index 8c32b6fec862f..0000000000000
--- a/examples/sycl/CMakeFiles/TargetDirectories.txt
+++ /dev/null
@@ -1,183 +0,0 @@
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/batched.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/beam-search.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/benchmark.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/embedding.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/finetune.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/infill.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava_static.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava-cli.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/main.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/tokenize.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/parallel.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/perplexity.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/quantize.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/simple.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/speculative.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/lookahead.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/lookup.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/install/strip.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/export-lora.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/edit_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/rebuild_cache.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/list_install_components.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/install.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/install/local.dir
-/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/install/strip.dir
diff --git a/examples/sycl/CMakeFiles/cmake.check_cache b/examples/sycl/CMakeFiles/cmake.check_cache
deleted file mode 100644
index 3dccd731726d7..0000000000000
--- a/examples/sycl/CMakeFiles/cmake.check_cache
+++ /dev/null
@@ -1 +0,0 @@
-# This file is generated by cmake for dependency checking of the CMakeCache.txt file
diff --git a/examples/sycl/CMakeFiles/progress.marks b/examples/sycl/CMakeFiles/progress.marks
deleted file mode 100644
index 59343b09ec765..0000000000000
--- a/examples/sycl/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-53
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index aee37f6a1d49b..8404704792520 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(TARGET ls-sycl-device)
 add_executable(${TARGET} ls-sycl-device.cpp)
 install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
 target_compile_features(${TARGET} PRIVATE cxx_std_17)
\ No newline at end of file
diff --git a/examples/sycl/Makefile b/examples/sycl/Makefile
deleted file mode 100644
index 8db0f04b1819b..0000000000000
--- a/examples/sycl/Makefile
+++ /dev/null
@@ -1,567 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl//CMakeFiles/progress.marks
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-#=============================================================================
-# Target rules for targets named baby-llama
-
-# Build rule for target.
-baby-llama: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama
-.PHONY : baby-llama
-
-# fast build rule for target.
-baby-llama/fast:
-	$(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/build
-.PHONY : baby-llama/fast
-
-#=============================================================================
-# Target rules for targets named batched
-
-# Build rule for target.
-batched: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched
-.PHONY : batched
-
-# fast build rule for target.
-batched/fast:
-	$(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/build
-.PHONY : batched/fast
-
-#=============================================================================
-# Target rules for targets named batched-bench
-
-# Build rule for target.
-batched-bench: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench
-.PHONY : batched-bench
-
-# fast build rule for target.
-batched-bench/fast:
-	$(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/build
-.PHONY : batched-bench/fast
-
-#=============================================================================
-# Target rules for targets named beam-search
-
-# Build rule for target.
-beam-search: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search
-.PHONY : beam-search
-
-# fast build rule for target.
-beam-search/fast:
-	$(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/build
-.PHONY : beam-search/fast
-
-#=============================================================================
-# Target rules for targets named benchmark
-
-# Build rule for target.
-benchmark: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark
-.PHONY : benchmark
-
-# fast build rule for target.
-benchmark/fast:
-	$(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/build
-.PHONY : benchmark/fast
-
-#=============================================================================
-# Target rules for targets named convert-llama2c-to-ggml
-
-# Build rule for target.
-convert-llama2c-to-ggml: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml
-.PHONY : convert-llama2c-to-ggml
-
-# fast build rule for target.
-convert-llama2c-to-ggml/fast:
-	$(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build
-.PHONY : convert-llama2c-to-ggml/fast
-
-#=============================================================================
-# Target rules for targets named embedding
-
-# Build rule for target.
-embedding: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding
-.PHONY : embedding
-
-# fast build rule for target.
-embedding/fast:
-	$(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/build
-.PHONY : embedding/fast
-
-#=============================================================================
-# Target rules for targets named finetune
-
-# Build rule for target.
-finetune: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune
-.PHONY : finetune
-
-# fast build rule for target.
-finetune/fast:
-	$(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/build
-.PHONY : finetune/fast
-
-#=============================================================================
-# Target rules for targets named infill
-
-# Build rule for target.
-infill: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill
-.PHONY : infill
-
-# fast build rule for target.
-infill/fast:
-	$(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/build
-.PHONY : infill/fast
-
-#=============================================================================
-# Target rules for targets named llama-bench
-
-# Build rule for target.
-llama-bench: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench
-.PHONY : llama-bench
-
-# fast build rule for target.
-llama-bench/fast:
-	$(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/build
-.PHONY : llama-bench/fast
-
-#=============================================================================
-# Target rules for targets named llava
-
-# Build rule for target.
-llava: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava
-.PHONY : llava
-
-# fast build rule for target.
-llava/fast:
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/build
-.PHONY : llava/fast
-
-#=============================================================================
-# Target rules for targets named llava_static
-
-# Build rule for target.
-llava_static: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava_static
-.PHONY : llava_static
-
-# fast build rule for target.
-llava_static/fast:
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava_static.dir/build.make llava/CMakeFiles/llava_static.dir/build
-.PHONY : llava_static/fast
-
-#=============================================================================
-# Target rules for targets named llava-cli
-
-# Build rule for target.
-llava-cli: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava-cli
-.PHONY : llava-cli
-
-# fast build rule for target.
-llava-cli/fast:
-	$(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/build
-.PHONY : llava-cli/fast
-
-#=============================================================================
-# Target rules for targets named ls-sycl-device
-
-# Build rule for target.
-ls-sycl-device: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 ls-sycl-device
-.PHONY : ls-sycl-device
-
-# fast build rule for target.
-ls-sycl-device/fast:
-	$(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/build
-.PHONY : ls-sycl-device/fast
-
-#=============================================================================
-# Target rules for targets named main
-
-# Build rule for target.
-main: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main
-.PHONY : main
-
-# fast build rule for target.
-main/fast:
-	$(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/build
-.PHONY : main/fast
-
-#=============================================================================
-# Target rules for targets named tokenize
-
-# Build rule for target.
-tokenize: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize
-.PHONY : tokenize
-
-# fast build rule for target.
-tokenize/fast:
-	$(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/build
-.PHONY : tokenize/fast
-
-#=============================================================================
-# Target rules for targets named parallel
-
-# Build rule for target.
-parallel: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel
-.PHONY : parallel
-
-# fast build rule for target.
-parallel/fast:
-	$(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/build
-.PHONY : parallel/fast
-
-#=============================================================================
-# Target rules for targets named perplexity
-
-# Build rule for target.
-perplexity: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity
-.PHONY : perplexity
-
-# fast build rule for target.
-perplexity/fast:
-	$(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/build
-.PHONY : perplexity/fast
-
-#=============================================================================
-# Target rules for targets named quantize
-
-# Build rule for target.
-quantize: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize
-.PHONY : quantize
-
-# fast build rule for target.
-quantize/fast:
-	$(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/build
-.PHONY : quantize/fast
-
-#=============================================================================
-# Target rules for targets named quantize-stats
-
-# Build rule for target.
-quantize-stats: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats
-.PHONY : quantize-stats
-
-# fast build rule for target.
-quantize-stats/fast:
-	$(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/build
-.PHONY : quantize-stats/fast
-
-#=============================================================================
-# Target rules for targets named save-load-state
-
-# Build rule for target.
-save-load-state: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state
-.PHONY : save-load-state
-
-# fast build rule for target.
-save-load-state/fast:
-	$(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/build
-.PHONY : save-load-state/fast
-
-#=============================================================================
-# Target rules for targets named simple
-
-# Build rule for target.
-simple: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple
-.PHONY : simple
-
-# fast build rule for target.
-simple/fast:
-	$(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/build
-.PHONY : simple/fast
-
-#=============================================================================
-# Target rules for targets named speculative
-
-# Build rule for target.
-speculative: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative
-.PHONY : speculative
-
-# fast build rule for target.
-speculative/fast:
-	$(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/build
-.PHONY : speculative/fast
-
-#=============================================================================
-# Target rules for targets named lookahead
-
-# Build rule for target.
-lookahead: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead
-.PHONY : lookahead
-
-# fast build rule for target.
-lookahead/fast:
-	$(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/build
-.PHONY : lookahead/fast
-
-#=============================================================================
-# Target rules for targets named lookup
-
-# Build rule for target.
-lookup: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup
-.PHONY : lookup
-
-# fast build rule for target.
-lookup/fast:
-	$(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/build
-.PHONY : lookup/fast
-
-#=============================================================================
-# Target rules for targets named train-text-from-scratch
-
-# Build rule for target.
-train-text-from-scratch: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch
-.PHONY : train-text-from-scratch
-
-# fast build rule for target.
-train-text-from-scratch/fast:
-	$(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build
-.PHONY : train-text-from-scratch/fast
-
-#=============================================================================
-# Target rules for targets named export-lora
-
-# Build rule for target.
-export-lora: cmake_check_build_system
-	$(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora
-.PHONY : export-lora
-
-# fast build rule for target.
-export-lora/fast:
-	$(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/build
-.PHONY : export-lora/fast
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... baby-llama"
-	@echo "... batched"
-	@echo "... batched-bench"
-	@echo "... beam-search"
-	@echo "... benchmark"
-	@echo "... convert-llama2c-to-ggml"
-	@echo "... embedding"
-	@echo "... export-lora"
-	@echo "... finetune"
-	@echo "... infill"
-	@echo "... llama-bench"
-	@echo "... llava"
-	@echo "... llava-cli"
-	@echo "... llava_static"
-	@echo "... lookahead"
-	@echo "... lookup"
-	@echo "... ls-sycl-device"
-	@echo "... main"
-	@echo "... parallel"
-	@echo "... perplexity"
-	@echo "... quantize"
-	@echo "... quantize-stats"
-	@echo "... save-load-state"
-	@echo "... simple"
-	@echo "... speculative"
-	@echo "... tokenize"
-	@echo "... train-text-from-scratch"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	$(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/baby-llama/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/baby-llama/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/baby-llama/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake
deleted file mode 100644
index 8a665ff63d034..0000000000000
--- a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama/baby-llama.cpp" "baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o" "gcc" "baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/build.make b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/build.make
deleted file mode 100644
index 99030e77f7436..0000000000000
--- a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include baby-llama/CMakeFiles/baby-llama.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include baby-llama/CMakeFiles/baby-llama.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include baby-llama/CMakeFiles/baby-llama.dir/flags.make
-
-baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o: baby-llama/CMakeFiles/baby-llama.dir/flags.make
-baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o: ../baby-llama/baby-llama.cpp
-baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o: baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o -MF CMakeFiles/baby-llama.dir/baby-llama.o.d -o CMakeFiles/baby-llama.dir/baby-llama.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama/baby-llama.cpp
-
-baby-llama/CMakeFiles/baby-llama.dir/baby-llama.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/baby-llama.dir/baby-llama.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama/baby-llama.cpp > CMakeFiles/baby-llama.dir/baby-llama.i
-
-baby-llama/CMakeFiles/baby-llama.dir/baby-llama.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/baby-llama.dir/baby-llama.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama/baby-llama.cpp -o CMakeFiles/baby-llama.dir/baby-llama.s
-
-# Object files for target baby-llama
-baby__llama_OBJECTS = \
-"CMakeFiles/baby-llama.dir/baby-llama.o"
-
-# External object files for target baby-llama
-baby__llama_EXTERNAL_OBJECTS =
-
-baby-llama/baby-llama: baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o
-baby-llama/baby-llama: baby-llama/CMakeFiles/baby-llama.dir/build.make
-baby-llama/baby-llama: baby-llama/CMakeFiles/baby-llama.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable baby-llama"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/baby-llama.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-baby-llama/CMakeFiles/baby-llama.dir/build: baby-llama/baby-llama
-.PHONY : baby-llama/CMakeFiles/baby-llama.dir/build
-
-baby-llama/CMakeFiles/baby-llama.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama && $(CMAKE_COMMAND) -P CMakeFiles/baby-llama.dir/cmake_clean.cmake
-.PHONY : baby-llama/CMakeFiles/baby-llama.dir/clean
-
-baby-llama/CMakeFiles/baby-llama.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : baby-llama/CMakeFiles/baby-llama.dir/depend
-
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/cmake_clean.cmake b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/cmake_clean.cmake
deleted file mode 100644
index 6a49649a74e35..0000000000000
--- a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/baby-llama.dir/baby-llama.o"
-  "CMakeFiles/baby-llama.dir/baby-llama.o.d"
-  "baby-llama"
-  "baby-llama.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/baby-llama.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.make b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.make
deleted file mode 100644
index f8bff7f24a6ad..0000000000000
--- a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for baby-llama.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.ts b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.ts
deleted file mode 100644
index 2ec3a9c66baba..0000000000000
--- a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for baby-llama.
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/depend.make b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/depend.make
deleted file mode 100644
index b8284dc182864..0000000000000
--- a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for baby-llama.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/flags.make b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/link.txt b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/link.txt
deleted file mode 100644
index c9ea6a9618a17..0000000000000
--- a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/baby-llama.dir/baby-llama.o -o baby-llama  -lcommon -lllama 
diff --git a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/progress.make b/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/progress.make
deleted file mode 100644
index abadeb0c3abaa..0000000000000
--- a/examples/sycl/baby-llama/CMakeFiles/baby-llama.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 1
-CMAKE_PROGRESS_2 = 2
-
diff --git a/examples/sycl/baby-llama/CMakeFiles/progress.marks b/examples/sycl/baby-llama/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/baby-llama/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/baby-llama/Makefile b/examples/sycl/baby-llama/Makefile
deleted file mode 100644
index 8bf34c6f86aa5..0000000000000
--- a/examples/sycl/baby-llama/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-baby-llama/CMakeFiles/baby-llama.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 baby-llama/CMakeFiles/baby-llama.dir/rule
-.PHONY : baby-llama/CMakeFiles/baby-llama.dir/rule
-
-# Convenience name for target.
-baby-llama: baby-llama/CMakeFiles/baby-llama.dir/rule
-.PHONY : baby-llama
-
-# fast build rule for target.
-baby-llama/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/build
-.PHONY : baby-llama/fast
-
-# target to build an object file
-baby-llama.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/baby-llama.o
-.PHONY : baby-llama.o
-
-# target to preprocess a source file
-baby-llama.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/baby-llama.i
-.PHONY : baby-llama.i
-
-# target to generate assembly for a file
-baby-llama.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f baby-llama/CMakeFiles/baby-llama.dir/build.make baby-llama/CMakeFiles/baby-llama.dir/baby-llama.s
-.PHONY : baby-llama.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... baby-llama"
-	@echo "... baby-llama.o"
-	@echo "... baby-llama.i"
-	@echo "... baby-llama.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/baby-llama/cmake_install.cmake b/examples/sycl/baby-llama/cmake_install.cmake
deleted file mode 100644
index 86ee23ec75026..0000000000000
--- a/examples/sycl/baby-llama/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/baby-llama
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/baby-llama")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/baby-llama")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/batched-bench/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/batched-bench/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/batched-bench/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake
deleted file mode 100644
index 13582c409e5ac..0000000000000
--- a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench/batched-bench.cpp" "batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o" "gcc" "batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/build.make b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/build.make
deleted file mode 100644
index bf64db49915de..0000000000000
--- a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include batched-bench/CMakeFiles/batched-bench.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include batched-bench/CMakeFiles/batched-bench.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include batched-bench/CMakeFiles/batched-bench.dir/flags.make
-
-batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o: batched-bench/CMakeFiles/batched-bench.dir/flags.make
-batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o: ../batched-bench/batched-bench.cpp
-batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o: batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o -MF CMakeFiles/batched-bench.dir/batched-bench.o.d -o CMakeFiles/batched-bench.dir/batched-bench.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench/batched-bench.cpp
-
-batched-bench/CMakeFiles/batched-bench.dir/batched-bench.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/batched-bench.dir/batched-bench.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench/batched-bench.cpp > CMakeFiles/batched-bench.dir/batched-bench.i
-
-batched-bench/CMakeFiles/batched-bench.dir/batched-bench.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/batched-bench.dir/batched-bench.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench/batched-bench.cpp -o CMakeFiles/batched-bench.dir/batched-bench.s
-
-# Object files for target batched-bench
-batched__bench_OBJECTS = \
-"CMakeFiles/batched-bench.dir/batched-bench.o"
-
-# External object files for target batched-bench
-batched__bench_EXTERNAL_OBJECTS =
-
-batched-bench/batched-bench: batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o
-batched-bench/batched-bench: batched-bench/CMakeFiles/batched-bench.dir/build.make
-batched-bench/batched-bench: batched-bench/CMakeFiles/batched-bench.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable batched-bench"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/batched-bench.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-batched-bench/CMakeFiles/batched-bench.dir/build: batched-bench/batched-bench
-.PHONY : batched-bench/CMakeFiles/batched-bench.dir/build
-
-batched-bench/CMakeFiles/batched-bench.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench && $(CMAKE_COMMAND) -P CMakeFiles/batched-bench.dir/cmake_clean.cmake
-.PHONY : batched-bench/CMakeFiles/batched-bench.dir/clean
-
-batched-bench/CMakeFiles/batched-bench.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : batched-bench/CMakeFiles/batched-bench.dir/depend
-
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/cmake_clean.cmake b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/cmake_clean.cmake
deleted file mode 100644
index 96618af56680c..0000000000000
--- a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/batched-bench.dir/batched-bench.o"
-  "CMakeFiles/batched-bench.dir/batched-bench.o.d"
-  "batched-bench"
-  "batched-bench.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/batched-bench.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.make b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.make
deleted file mode 100644
index a0119aa940c10..0000000000000
--- a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for batched-bench.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.ts b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.ts
deleted file mode 100644
index dce1c2dad71f8..0000000000000
--- a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for batched-bench.
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/depend.make b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/depend.make
deleted file mode 100644
index 163222cab2552..0000000000000
--- a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for batched-bench.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/flags.make b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/link.txt b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/link.txt
deleted file mode 100644
index a2173ac6a4c48..0000000000000
--- a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/batched-bench.dir/batched-bench.o -o batched-bench  -lcommon -lllama 
diff --git a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/progress.make b/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/progress.make
deleted file mode 100644
index 3a86673aa7c18..0000000000000
--- a/examples/sycl/batched-bench/CMakeFiles/batched-bench.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 5
-CMAKE_PROGRESS_2 = 6
-
diff --git a/examples/sycl/batched-bench/CMakeFiles/progress.marks b/examples/sycl/batched-bench/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/batched-bench/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/batched-bench/Makefile b/examples/sycl/batched-bench/Makefile
deleted file mode 100644
index 9643d8cd01572..0000000000000
--- a/examples/sycl/batched-bench/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-batched-bench/CMakeFiles/batched-bench.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched-bench/CMakeFiles/batched-bench.dir/rule
-.PHONY : batched-bench/CMakeFiles/batched-bench.dir/rule
-
-# Convenience name for target.
-batched-bench: batched-bench/CMakeFiles/batched-bench.dir/rule
-.PHONY : batched-bench
-
-# fast build rule for target.
-batched-bench/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/build
-.PHONY : batched-bench/fast
-
-# target to build an object file
-batched-bench.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/batched-bench.o
-.PHONY : batched-bench.o
-
-# target to preprocess a source file
-batched-bench.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/batched-bench.i
-.PHONY : batched-bench.i
-
-# target to generate assembly for a file
-batched-bench.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched-bench/CMakeFiles/batched-bench.dir/build.make batched-bench/CMakeFiles/batched-bench.dir/batched-bench.s
-.PHONY : batched-bench.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... batched-bench"
-	@echo "... batched-bench.o"
-	@echo "... batched-bench.i"
-	@echo "... batched-bench.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/batched-bench/cmake_install.cmake b/examples/sycl/batched-bench/cmake_install.cmake
deleted file mode 100644
index baa6e752d936f..0000000000000
--- a/examples/sycl/batched-bench/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/batched-bench
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/batched-bench")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched-bench")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/batched/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/batched/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/batched/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/DependInfo.cmake b/examples/sycl/batched/CMakeFiles/batched.dir/DependInfo.cmake
deleted file mode 100644
index cfa8cdcb00d4e..0000000000000
--- a/examples/sycl/batched/CMakeFiles/batched.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/batched/batched.cpp" "batched/CMakeFiles/batched.dir/batched.o" "gcc" "batched/CMakeFiles/batched.dir/batched.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/build.make b/examples/sycl/batched/CMakeFiles/batched.dir/build.make
deleted file mode 100644
index b69a5c8e00476..0000000000000
--- a/examples/sycl/batched/CMakeFiles/batched.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include batched/CMakeFiles/batched.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include batched/CMakeFiles/batched.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include batched/CMakeFiles/batched.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include batched/CMakeFiles/batched.dir/flags.make
-
-batched/CMakeFiles/batched.dir/batched.o: batched/CMakeFiles/batched.dir/flags.make
-batched/CMakeFiles/batched.dir/batched.o: ../batched/batched.cpp
-batched/CMakeFiles/batched.dir/batched.o: batched/CMakeFiles/batched.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object batched/CMakeFiles/batched.dir/batched.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT batched/CMakeFiles/batched.dir/batched.o -MF CMakeFiles/batched.dir/batched.o.d -o CMakeFiles/batched.dir/batched.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/batched/batched.cpp
-
-batched/CMakeFiles/batched.dir/batched.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/batched.dir/batched.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/batched/batched.cpp > CMakeFiles/batched.dir/batched.i
-
-batched/CMakeFiles/batched.dir/batched.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/batched.dir/batched.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/batched/batched.cpp -o CMakeFiles/batched.dir/batched.s
-
-# Object files for target batched
-batched_OBJECTS = \
-"CMakeFiles/batched.dir/batched.o"
-
-# External object files for target batched
-batched_EXTERNAL_OBJECTS =
-
-batched/batched: batched/CMakeFiles/batched.dir/batched.o
-batched/batched: batched/CMakeFiles/batched.dir/build.make
-batched/batched: batched/CMakeFiles/batched.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable batched"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/batched.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-batched/CMakeFiles/batched.dir/build: batched/batched
-.PHONY : batched/CMakeFiles/batched.dir/build
-
-batched/CMakeFiles/batched.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched && $(CMAKE_COMMAND) -P CMakeFiles/batched.dir/cmake_clean.cmake
-.PHONY : batched/CMakeFiles/batched.dir/clean
-
-batched/CMakeFiles/batched.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/batched /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/CMakeFiles/batched.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : batched/CMakeFiles/batched.dir/depend
-
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/cmake_clean.cmake b/examples/sycl/batched/CMakeFiles/batched.dir/cmake_clean.cmake
deleted file mode 100644
index cec64ae00c700..0000000000000
--- a/examples/sycl/batched/CMakeFiles/batched.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/batched.dir/batched.o"
-  "CMakeFiles/batched.dir/batched.o.d"
-  "batched"
-  "batched.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/batched.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.make b/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.make
deleted file mode 100644
index 9a449d3648e98..0000000000000
--- a/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for batched.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.ts b/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.ts
deleted file mode 100644
index 803668d02f618..0000000000000
--- a/examples/sycl/batched/CMakeFiles/batched.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for batched.
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/depend.make b/examples/sycl/batched/CMakeFiles/batched.dir/depend.make
deleted file mode 100644
index c583466f650c2..0000000000000
--- a/examples/sycl/batched/CMakeFiles/batched.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for batched.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/flags.make b/examples/sycl/batched/CMakeFiles/batched.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/batched/CMakeFiles/batched.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/link.txt b/examples/sycl/batched/CMakeFiles/batched.dir/link.txt
deleted file mode 100644
index 26eb25de70ccc..0000000000000
--- a/examples/sycl/batched/CMakeFiles/batched.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/batched.dir/batched.o -o batched  -lcommon -lllama 
diff --git a/examples/sycl/batched/CMakeFiles/batched.dir/progress.make b/examples/sycl/batched/CMakeFiles/batched.dir/progress.make
deleted file mode 100644
index 8c8fb6fbbc138..0000000000000
--- a/examples/sycl/batched/CMakeFiles/batched.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 3
-CMAKE_PROGRESS_2 = 4
-
diff --git a/examples/sycl/batched/CMakeFiles/progress.marks b/examples/sycl/batched/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/batched/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/batched/Makefile b/examples/sycl/batched/Makefile
deleted file mode 100644
index ce954a5513214..0000000000000
--- a/examples/sycl/batched/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-batched/CMakeFiles/batched.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 batched/CMakeFiles/batched.dir/rule
-.PHONY : batched/CMakeFiles/batched.dir/rule
-
-# Convenience name for target.
-batched: batched/CMakeFiles/batched.dir/rule
-.PHONY : batched
-
-# fast build rule for target.
-batched/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/build
-.PHONY : batched/fast
-
-# target to build an object file
-batched.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/batched.o
-.PHONY : batched.o
-
-# target to preprocess a source file
-batched.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/batched.i
-.PHONY : batched.i
-
-# target to generate assembly for a file
-batched.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f batched/CMakeFiles/batched.dir/build.make batched/CMakeFiles/batched.dir/batched.s
-.PHONY : batched.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... batched"
-	@echo "... batched.o"
-	@echo "... batched.i"
-	@echo "... batched.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/batched/cmake_install.cmake b/examples/sycl/batched/cmake_install.cmake
deleted file mode 100644
index e8c0214cfe960..0000000000000
--- a/examples/sycl/batched/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/batched
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/batched")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/batched")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/beam-search/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/beam-search/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/beam-search/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake
deleted file mode 100644
index 7822f7a7ca873..0000000000000
--- a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/beam-search/beam-search.cpp" "beam-search/CMakeFiles/beam-search.dir/beam-search.o" "gcc" "beam-search/CMakeFiles/beam-search.dir/beam-search.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/build.make b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/build.make
deleted file mode 100644
index e462d1b34af33..0000000000000
--- a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include beam-search/CMakeFiles/beam-search.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include beam-search/CMakeFiles/beam-search.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include beam-search/CMakeFiles/beam-search.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include beam-search/CMakeFiles/beam-search.dir/flags.make
-
-beam-search/CMakeFiles/beam-search.dir/beam-search.o: beam-search/CMakeFiles/beam-search.dir/flags.make
-beam-search/CMakeFiles/beam-search.dir/beam-search.o: ../beam-search/beam-search.cpp
-beam-search/CMakeFiles/beam-search.dir/beam-search.o: beam-search/CMakeFiles/beam-search.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object beam-search/CMakeFiles/beam-search.dir/beam-search.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT beam-search/CMakeFiles/beam-search.dir/beam-search.o -MF CMakeFiles/beam-search.dir/beam-search.o.d -o CMakeFiles/beam-search.dir/beam-search.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/beam-search/beam-search.cpp
-
-beam-search/CMakeFiles/beam-search.dir/beam-search.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/beam-search.dir/beam-search.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/beam-search/beam-search.cpp > CMakeFiles/beam-search.dir/beam-search.i
-
-beam-search/CMakeFiles/beam-search.dir/beam-search.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/beam-search.dir/beam-search.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/beam-search/beam-search.cpp -o CMakeFiles/beam-search.dir/beam-search.s
-
-# Object files for target beam-search
-beam__search_OBJECTS = \
-"CMakeFiles/beam-search.dir/beam-search.o"
-
-# External object files for target beam-search
-beam__search_EXTERNAL_OBJECTS =
-
-beam-search/beam-search: beam-search/CMakeFiles/beam-search.dir/beam-search.o
-beam-search/beam-search: beam-search/CMakeFiles/beam-search.dir/build.make
-beam-search/beam-search: beam-search/CMakeFiles/beam-search.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable beam-search"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/beam-search.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-beam-search/CMakeFiles/beam-search.dir/build: beam-search/beam-search
-.PHONY : beam-search/CMakeFiles/beam-search.dir/build
-
-beam-search/CMakeFiles/beam-search.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search && $(CMAKE_COMMAND) -P CMakeFiles/beam-search.dir/cmake_clean.cmake
-.PHONY : beam-search/CMakeFiles/beam-search.dir/clean
-
-beam-search/CMakeFiles/beam-search.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/beam-search /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/CMakeFiles/beam-search.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : beam-search/CMakeFiles/beam-search.dir/depend
-
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/cmake_clean.cmake b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/cmake_clean.cmake
deleted file mode 100644
index e28d241a266ec..0000000000000
--- a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/beam-search.dir/beam-search.o"
-  "CMakeFiles/beam-search.dir/beam-search.o.d"
-  "beam-search"
-  "beam-search.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/beam-search.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.make b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.make
deleted file mode 100644
index 7b9ac25582ada..0000000000000
--- a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for beam-search.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.ts b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.ts
deleted file mode 100644
index 1117fb6f80e48..0000000000000
--- a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for beam-search.
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/depend.make b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/depend.make
deleted file mode 100644
index 37ec7a3f8206c..0000000000000
--- a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for beam-search.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/flags.make b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/link.txt b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/link.txt
deleted file mode 100644
index 1acfe16a7a283..0000000000000
--- a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/beam-search.dir/beam-search.o -o beam-search  -lcommon -lllama 
diff --git a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/progress.make b/examples/sycl/beam-search/CMakeFiles/beam-search.dir/progress.make
deleted file mode 100644
index 72bb7dd025afc..0000000000000
--- a/examples/sycl/beam-search/CMakeFiles/beam-search.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 7
-CMAKE_PROGRESS_2 = 8
-
diff --git a/examples/sycl/beam-search/CMakeFiles/progress.marks b/examples/sycl/beam-search/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/beam-search/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/beam-search/Makefile b/examples/sycl/beam-search/Makefile
deleted file mode 100644
index a1a902339db93..0000000000000
--- a/examples/sycl/beam-search/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-beam-search/CMakeFiles/beam-search.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 beam-search/CMakeFiles/beam-search.dir/rule
-.PHONY : beam-search/CMakeFiles/beam-search.dir/rule
-
-# Convenience name for target.
-beam-search: beam-search/CMakeFiles/beam-search.dir/rule
-.PHONY : beam-search
-
-# fast build rule for target.
-beam-search/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/build
-.PHONY : beam-search/fast
-
-# target to build an object file
-beam-search.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/beam-search.o
-.PHONY : beam-search.o
-
-# target to preprocess a source file
-beam-search.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/beam-search.i
-.PHONY : beam-search.i
-
-# target to generate assembly for a file
-beam-search.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f beam-search/CMakeFiles/beam-search.dir/build.make beam-search/CMakeFiles/beam-search.dir/beam-search.s
-.PHONY : beam-search.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... beam-search"
-	@echo "... beam-search.o"
-	@echo "... beam-search.i"
-	@echo "... beam-search.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/beam-search/cmake_install.cmake b/examples/sycl/beam-search/cmake_install.cmake
deleted file mode 100644
index 33cf7cf46a32a..0000000000000
--- a/examples/sycl/beam-search/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/beam-search
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/beam-search")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/beam-search")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/benchmark/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/benchmark/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/benchmark/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake
deleted file mode 100644
index 25e6406f3ed78..0000000000000
--- a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/benchmark/benchmark-matmult.cpp" "benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o" "gcc" "benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/build.make b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/build.make
deleted file mode 100644
index 84fb06db47923..0000000000000
--- a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include benchmark/CMakeFiles/benchmark.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include benchmark/CMakeFiles/benchmark.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include benchmark/CMakeFiles/benchmark.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include benchmark/CMakeFiles/benchmark.dir/flags.make
-
-benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o: benchmark/CMakeFiles/benchmark.dir/flags.make
-benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o: ../benchmark/benchmark-matmult.cpp
-benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o: benchmark/CMakeFiles/benchmark.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o -MF CMakeFiles/benchmark.dir/benchmark-matmult.o.d -o CMakeFiles/benchmark.dir/benchmark-matmult.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/benchmark/benchmark-matmult.cpp
-
-benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/benchmark.dir/benchmark-matmult.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/benchmark/benchmark-matmult.cpp > CMakeFiles/benchmark.dir/benchmark-matmult.i
-
-benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/benchmark.dir/benchmark-matmult.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/benchmark/benchmark-matmult.cpp -o CMakeFiles/benchmark.dir/benchmark-matmult.s
-
-# Object files for target benchmark
-benchmark_OBJECTS = \
-"CMakeFiles/benchmark.dir/benchmark-matmult.o"
-
-# External object files for target benchmark
-benchmark_EXTERNAL_OBJECTS =
-
-benchmark/benchmark: benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o
-benchmark/benchmark: benchmark/CMakeFiles/benchmark.dir/build.make
-benchmark/benchmark: benchmark/CMakeFiles/benchmark.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable benchmark"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/benchmark.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-benchmark/CMakeFiles/benchmark.dir/build: benchmark/benchmark
-.PHONY : benchmark/CMakeFiles/benchmark.dir/build
-
-benchmark/CMakeFiles/benchmark.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark && $(CMAKE_COMMAND) -P CMakeFiles/benchmark.dir/cmake_clean.cmake
-.PHONY : benchmark/CMakeFiles/benchmark.dir/clean
-
-benchmark/CMakeFiles/benchmark.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/benchmark /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/CMakeFiles/benchmark.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : benchmark/CMakeFiles/benchmark.dir/depend
-
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/cmake_clean.cmake b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/cmake_clean.cmake
deleted file mode 100644
index 870df51f87439..0000000000000
--- a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/benchmark.dir/benchmark-matmult.o"
-  "CMakeFiles/benchmark.dir/benchmark-matmult.o.d"
-  "benchmark"
-  "benchmark.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/benchmark.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.make b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.make
deleted file mode 100644
index 871e0bce58355..0000000000000
--- a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for benchmark.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.ts b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.ts
deleted file mode 100644
index 66ee2110c21b7..0000000000000
--- a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for benchmark.
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/depend.make b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/depend.make
deleted file mode 100644
index 71cca2eeae086..0000000000000
--- a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for benchmark.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/flags.make b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/flags.make
deleted file mode 100644
index d799cee981f62..0000000000000
--- a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples -I/home/jianyuzh/ws/llama.cpp/develop/examples/benchmark/../../common
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/link.txt b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/link.txt
deleted file mode 100644
index f5f38e36cd745..0000000000000
--- a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/benchmark.dir/benchmark-matmult.o -o benchmark  -lllama -lbuild_info 
diff --git a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/progress.make b/examples/sycl/benchmark/CMakeFiles/benchmark.dir/progress.make
deleted file mode 100644
index b700c2c902219..0000000000000
--- a/examples/sycl/benchmark/CMakeFiles/benchmark.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 9
-CMAKE_PROGRESS_2 = 10
-
diff --git a/examples/sycl/benchmark/CMakeFiles/progress.marks b/examples/sycl/benchmark/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/benchmark/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/benchmark/Makefile b/examples/sycl/benchmark/Makefile
deleted file mode 100644
index 68795570edc0b..0000000000000
--- a/examples/sycl/benchmark/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-benchmark/CMakeFiles/benchmark.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 benchmark/CMakeFiles/benchmark.dir/rule
-.PHONY : benchmark/CMakeFiles/benchmark.dir/rule
-
-# Convenience name for target.
-benchmark: benchmark/CMakeFiles/benchmark.dir/rule
-.PHONY : benchmark
-
-# fast build rule for target.
-benchmark/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/build
-.PHONY : benchmark/fast
-
-# target to build an object file
-benchmark-matmult.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.o
-.PHONY : benchmark-matmult.o
-
-# target to preprocess a source file
-benchmark-matmult.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.i
-.PHONY : benchmark-matmult.i
-
-# target to generate assembly for a file
-benchmark-matmult.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f benchmark/CMakeFiles/benchmark.dir/build.make benchmark/CMakeFiles/benchmark.dir/benchmark-matmult.s
-.PHONY : benchmark-matmult.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... benchmark"
-	@echo "... benchmark-matmult.o"
-	@echo "... benchmark-matmult.i"
-	@echo "... benchmark-matmult.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/benchmark/cmake_install.cmake b/examples/sycl/benchmark/cmake_install.cmake
deleted file mode 100644
index d2a89c99230ff..0000000000000
--- a/examples/sycl/benchmark/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/benchmark
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/benchmark")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/benchmark")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/cmake_install.cmake b/examples/sycl/cmake_install.cmake
deleted file mode 100644
index e365c411dabd6..0000000000000
--- a/examples/sycl/cmake_install.cmake
+++ /dev/null
@@ -1,84 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if(NOT CMAKE_INSTALL_LOCAL_ONLY)
-  # Include the install script for each subdirectory.
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/baby-llama/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/batched-bench/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/beam-search/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/benchmark/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/cmake_install.cmake")
-  include("/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/cmake_install.cmake")
-
-endif()
-
-if(CMAKE_INSTALL_COMPONENT)
-  set(CMAKE_INSTALL_MANIFEST "install_manifest_${CMAKE_INSTALL_COMPONENT}.txt")
-else()
-  set(CMAKE_INSTALL_MANIFEST "install_manifest.txt")
-endif()
-
-string(REPLACE ";" "\n" CMAKE_INSTALL_MANIFEST_CONTENT
-       "${CMAKE_INSTALL_MANIFEST_FILES}")
-file(WRITE "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/${CMAKE_INSTALL_MANIFEST}"
-     "${CMAKE_INSTALL_MANIFEST_CONTENT}")
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake
deleted file mode 100644
index 54c704ca3ca01..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp" "convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o" "gcc" "convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make
deleted file mode 100644
index ee8cc8e191ff9..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make
-
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o: ../convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o -MF CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o.d -o CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp
-
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp > CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.i
-
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp -o CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.s
-
-# Object files for target convert-llama2c-to-ggml
-convert__llama2c__to__ggml_OBJECTS = \
-"CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o"
-
-# External object files for target convert-llama2c-to-ggml
-convert__llama2c__to__ggml_EXTERNAL_OBJECTS =
-
-convert-llama2c-to-ggml/convert-llama2c-to-ggml: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o
-convert-llama2c-to-ggml/convert-llama2c-to-ggml: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make
-convert-llama2c-to-ggml/convert-llama2c-to-ggml: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable convert-llama2c-to-ggml"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/convert-llama2c-to-ggml.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build: convert-llama2c-to-ggml/convert-llama2c-to-ggml
-.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build
-
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml && $(CMAKE_COMMAND) -P CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean.cmake
-.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/clean
-
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend
-
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean.cmake b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean.cmake
deleted file mode 100644
index 27a9734b51264..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o"
-  "CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o.d"
-  "convert-llama2c-to-ggml"
-  "convert-llama2c-to-ggml.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/convert-llama2c-to-ggml.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.make b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.make
deleted file mode 100644
index 0c7de424c1697..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for convert-llama2c-to-ggml.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.ts b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.ts
deleted file mode 100644
index 4db962ade5dfe..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for convert-llama2c-to-ggml.
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend.make b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend.make
deleted file mode 100644
index 7a48cf562a9dc..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for convert-llama2c-to-ggml.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/link.txt b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/link.txt
deleted file mode 100644
index fadbf2651cb1d..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o -o convert-llama2c-to-ggml  -lcommon -lllama 
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/progress.make b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/progress.make
deleted file mode 100644
index 596289c0fd56a..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 11
-CMAKE_PROGRESS_2 = 12
-
diff --git a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/progress.marks b/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/convert-llama2c-to-ggml/Makefile b/examples/sycl/convert-llama2c-to-ggml/Makefile
deleted file mode 100644
index 37c3f05fd314d..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule
-.PHONY : convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule
-
-# Convenience name for target.
-convert-llama2c-to-ggml: convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/rule
-.PHONY : convert-llama2c-to-ggml
-
-# fast build rule for target.
-convert-llama2c-to-ggml/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build
-.PHONY : convert-llama2c-to-ggml/fast
-
-# target to build an object file
-convert-llama2c-to-ggml.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.o
-.PHONY : convert-llama2c-to-ggml.o
-
-# target to preprocess a source file
-convert-llama2c-to-ggml.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.i
-.PHONY : convert-llama2c-to-ggml.i
-
-# target to generate assembly for a file
-convert-llama2c-to-ggml.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/build.make convert-llama2c-to-ggml/CMakeFiles/convert-llama2c-to-ggml.dir/convert-llama2c-to-ggml.s
-.PHONY : convert-llama2c-to-ggml.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... convert-llama2c-to-ggml"
-	@echo "... convert-llama2c-to-ggml.o"
-	@echo "... convert-llama2c-to-ggml.i"
-	@echo "... convert-llama2c-to-ggml.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/convert-llama2c-to-ggml/cmake_install.cmake b/examples/sycl/convert-llama2c-to-ggml/cmake_install.cmake
deleted file mode 100644
index 4e812cd016981..0000000000000
--- a/examples/sycl/convert-llama2c-to-ggml/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/convert-llama2c-to-ggml
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/convert-llama2c-to-ggml/convert-llama2c-to-ggml")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/convert-llama2c-to-ggml")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/embedding/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/embedding/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/embedding/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/DependInfo.cmake b/examples/sycl/embedding/CMakeFiles/embedding.dir/DependInfo.cmake
deleted file mode 100644
index 212c12d98822f..0000000000000
--- a/examples/sycl/embedding/CMakeFiles/embedding.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/embedding/embedding.cpp" "embedding/CMakeFiles/embedding.dir/embedding.o" "gcc" "embedding/CMakeFiles/embedding.dir/embedding.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/build.make b/examples/sycl/embedding/CMakeFiles/embedding.dir/build.make
deleted file mode 100644
index a5788eed19ffa..0000000000000
--- a/examples/sycl/embedding/CMakeFiles/embedding.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include embedding/CMakeFiles/embedding.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include embedding/CMakeFiles/embedding.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include embedding/CMakeFiles/embedding.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include embedding/CMakeFiles/embedding.dir/flags.make
-
-embedding/CMakeFiles/embedding.dir/embedding.o: embedding/CMakeFiles/embedding.dir/flags.make
-embedding/CMakeFiles/embedding.dir/embedding.o: ../embedding/embedding.cpp
-embedding/CMakeFiles/embedding.dir/embedding.o: embedding/CMakeFiles/embedding.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object embedding/CMakeFiles/embedding.dir/embedding.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT embedding/CMakeFiles/embedding.dir/embedding.o -MF CMakeFiles/embedding.dir/embedding.o.d -o CMakeFiles/embedding.dir/embedding.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/embedding/embedding.cpp
-
-embedding/CMakeFiles/embedding.dir/embedding.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/embedding.dir/embedding.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/embedding/embedding.cpp > CMakeFiles/embedding.dir/embedding.i
-
-embedding/CMakeFiles/embedding.dir/embedding.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/embedding.dir/embedding.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/embedding/embedding.cpp -o CMakeFiles/embedding.dir/embedding.s
-
-# Object files for target embedding
-embedding_OBJECTS = \
-"CMakeFiles/embedding.dir/embedding.o"
-
-# External object files for target embedding
-embedding_EXTERNAL_OBJECTS =
-
-embedding/embedding: embedding/CMakeFiles/embedding.dir/embedding.o
-embedding/embedding: embedding/CMakeFiles/embedding.dir/build.make
-embedding/embedding: embedding/CMakeFiles/embedding.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable embedding"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/embedding.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-embedding/CMakeFiles/embedding.dir/build: embedding/embedding
-.PHONY : embedding/CMakeFiles/embedding.dir/build
-
-embedding/CMakeFiles/embedding.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding && $(CMAKE_COMMAND) -P CMakeFiles/embedding.dir/cmake_clean.cmake
-.PHONY : embedding/CMakeFiles/embedding.dir/clean
-
-embedding/CMakeFiles/embedding.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/embedding /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/CMakeFiles/embedding.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : embedding/CMakeFiles/embedding.dir/depend
-
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/cmake_clean.cmake b/examples/sycl/embedding/CMakeFiles/embedding.dir/cmake_clean.cmake
deleted file mode 100644
index e6d17db5dfb54..0000000000000
--- a/examples/sycl/embedding/CMakeFiles/embedding.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/embedding.dir/embedding.o"
-  "CMakeFiles/embedding.dir/embedding.o.d"
-  "embedding"
-  "embedding.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/embedding.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.make b/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.make
deleted file mode 100644
index 51d484f9f0010..0000000000000
--- a/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for embedding.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.ts b/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.ts
deleted file mode 100644
index af47aa36d337d..0000000000000
--- a/examples/sycl/embedding/CMakeFiles/embedding.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for embedding.
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/depend.make b/examples/sycl/embedding/CMakeFiles/embedding.dir/depend.make
deleted file mode 100644
index 12b2e39ad1bd4..0000000000000
--- a/examples/sycl/embedding/CMakeFiles/embedding.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for embedding.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/flags.make b/examples/sycl/embedding/CMakeFiles/embedding.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/embedding/CMakeFiles/embedding.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/link.txt b/examples/sycl/embedding/CMakeFiles/embedding.dir/link.txt
deleted file mode 100644
index 7d5713ba926ec..0000000000000
--- a/examples/sycl/embedding/CMakeFiles/embedding.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/embedding.dir/embedding.o -o embedding  -lcommon -lllama 
diff --git a/examples/sycl/embedding/CMakeFiles/embedding.dir/progress.make b/examples/sycl/embedding/CMakeFiles/embedding.dir/progress.make
deleted file mode 100644
index d92f75a2935ea..0000000000000
--- a/examples/sycl/embedding/CMakeFiles/embedding.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 13
-CMAKE_PROGRESS_2 = 14
-
diff --git a/examples/sycl/embedding/CMakeFiles/progress.marks b/examples/sycl/embedding/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/embedding/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/embedding/Makefile b/examples/sycl/embedding/Makefile
deleted file mode 100644
index 588a49a8da14d..0000000000000
--- a/examples/sycl/embedding/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-embedding/CMakeFiles/embedding.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 embedding/CMakeFiles/embedding.dir/rule
-.PHONY : embedding/CMakeFiles/embedding.dir/rule
-
-# Convenience name for target.
-embedding: embedding/CMakeFiles/embedding.dir/rule
-.PHONY : embedding
-
-# fast build rule for target.
-embedding/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/build
-.PHONY : embedding/fast
-
-# target to build an object file
-embedding.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/embedding.o
-.PHONY : embedding.o
-
-# target to preprocess a source file
-embedding.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/embedding.i
-.PHONY : embedding.i
-
-# target to generate assembly for a file
-embedding.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f embedding/CMakeFiles/embedding.dir/build.make embedding/CMakeFiles/embedding.dir/embedding.s
-.PHONY : embedding.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... embedding"
-	@echo "... embedding.o"
-	@echo "... embedding.i"
-	@echo "... embedding.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/embedding/cmake_install.cmake b/examples/sycl/embedding/cmake_install.cmake
deleted file mode 100644
index 1ed94ada9372d..0000000000000
--- a/examples/sycl/embedding/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/embedding
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/embedding/embedding")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/embedding")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/export-lora/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/export-lora/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/export-lora/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake
deleted file mode 100644
index 509dd9fd84d2f..0000000000000
--- a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/export-lora/export-lora.cpp" "export-lora/CMakeFiles/export-lora.dir/export-lora.o" "gcc" "export-lora/CMakeFiles/export-lora.dir/export-lora.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/build.make b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/build.make
deleted file mode 100644
index a30d776294059..0000000000000
--- a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include export-lora/CMakeFiles/export-lora.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include export-lora/CMakeFiles/export-lora.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include export-lora/CMakeFiles/export-lora.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include export-lora/CMakeFiles/export-lora.dir/flags.make
-
-export-lora/CMakeFiles/export-lora.dir/export-lora.o: export-lora/CMakeFiles/export-lora.dir/flags.make
-export-lora/CMakeFiles/export-lora.dir/export-lora.o: ../export-lora/export-lora.cpp
-export-lora/CMakeFiles/export-lora.dir/export-lora.o: export-lora/CMakeFiles/export-lora.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object export-lora/CMakeFiles/export-lora.dir/export-lora.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT export-lora/CMakeFiles/export-lora.dir/export-lora.o -MF CMakeFiles/export-lora.dir/export-lora.o.d -o CMakeFiles/export-lora.dir/export-lora.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/export-lora/export-lora.cpp
-
-export-lora/CMakeFiles/export-lora.dir/export-lora.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/export-lora.dir/export-lora.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/export-lora/export-lora.cpp > CMakeFiles/export-lora.dir/export-lora.i
-
-export-lora/CMakeFiles/export-lora.dir/export-lora.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/export-lora.dir/export-lora.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/export-lora/export-lora.cpp -o CMakeFiles/export-lora.dir/export-lora.s
-
-# Object files for target export-lora
-export__lora_OBJECTS = \
-"CMakeFiles/export-lora.dir/export-lora.o"
-
-# External object files for target export-lora
-export__lora_EXTERNAL_OBJECTS =
-
-export-lora/export-lora: export-lora/CMakeFiles/export-lora.dir/export-lora.o
-export-lora/export-lora: export-lora/CMakeFiles/export-lora.dir/build.make
-export-lora/export-lora: export-lora/CMakeFiles/export-lora.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable export-lora"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/export-lora.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-export-lora/CMakeFiles/export-lora.dir/build: export-lora/export-lora
-.PHONY : export-lora/CMakeFiles/export-lora.dir/build
-
-export-lora/CMakeFiles/export-lora.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora && $(CMAKE_COMMAND) -P CMakeFiles/export-lora.dir/cmake_clean.cmake
-.PHONY : export-lora/CMakeFiles/export-lora.dir/clean
-
-export-lora/CMakeFiles/export-lora.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/export-lora /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/CMakeFiles/export-lora.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : export-lora/CMakeFiles/export-lora.dir/depend
-
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/cmake_clean.cmake b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/cmake_clean.cmake
deleted file mode 100644
index e4439996f1cc8..0000000000000
--- a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/export-lora.dir/export-lora.o"
-  "CMakeFiles/export-lora.dir/export-lora.o.d"
-  "export-lora"
-  "export-lora.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/export-lora.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.make b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.make
deleted file mode 100644
index bd6b5614f8f24..0000000000000
--- a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for export-lora.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.ts b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.ts
deleted file mode 100644
index 060cd6097e48a..0000000000000
--- a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for export-lora.
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/depend.make b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/depend.make
deleted file mode 100644
index 98a9b14434306..0000000000000
--- a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for export-lora.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/flags.make b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/link.txt b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/link.txt
deleted file mode 100644
index 17cb92024e35d..0000000000000
--- a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/export-lora.dir/export-lora.o -o export-lora  -lcommon -lllama 
diff --git a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/progress.make b/examples/sycl/export-lora/CMakeFiles/export-lora.dir/progress.make
deleted file mode 100644
index a35c33b98d591..0000000000000
--- a/examples/sycl/export-lora/CMakeFiles/export-lora.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 15
-CMAKE_PROGRESS_2 = 16
-
diff --git a/examples/sycl/export-lora/CMakeFiles/progress.marks b/examples/sycl/export-lora/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/export-lora/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/export-lora/Makefile b/examples/sycl/export-lora/Makefile
deleted file mode 100644
index 81a26eebe5704..0000000000000
--- a/examples/sycl/export-lora/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-export-lora/CMakeFiles/export-lora.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 export-lora/CMakeFiles/export-lora.dir/rule
-.PHONY : export-lora/CMakeFiles/export-lora.dir/rule
-
-# Convenience name for target.
-export-lora: export-lora/CMakeFiles/export-lora.dir/rule
-.PHONY : export-lora
-
-# fast build rule for target.
-export-lora/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/build
-.PHONY : export-lora/fast
-
-# target to build an object file
-export-lora.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/export-lora.o
-.PHONY : export-lora.o
-
-# target to preprocess a source file
-export-lora.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/export-lora.i
-.PHONY : export-lora.i
-
-# target to generate assembly for a file
-export-lora.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f export-lora/CMakeFiles/export-lora.dir/build.make export-lora/CMakeFiles/export-lora.dir/export-lora.s
-.PHONY : export-lora.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... export-lora"
-	@echo "... export-lora.o"
-	@echo "... export-lora.i"
-	@echo "... export-lora.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/export-lora/cmake_install.cmake b/examples/sycl/export-lora/cmake_install.cmake
deleted file mode 100644
index 82df354c0669e..0000000000000
--- a/examples/sycl/export-lora/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/export-lora
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/export-lora/export-lora")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/export-lora")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/finetune/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/finetune/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/finetune/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/DependInfo.cmake b/examples/sycl/finetune/CMakeFiles/finetune.dir/DependInfo.cmake
deleted file mode 100644
index 1c4b740826406..0000000000000
--- a/examples/sycl/finetune/CMakeFiles/finetune.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/finetune/finetune.cpp" "finetune/CMakeFiles/finetune.dir/finetune.o" "gcc" "finetune/CMakeFiles/finetune.dir/finetune.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/build.make b/examples/sycl/finetune/CMakeFiles/finetune.dir/build.make
deleted file mode 100644
index 87eb77fdf4f6e..0000000000000
--- a/examples/sycl/finetune/CMakeFiles/finetune.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include finetune/CMakeFiles/finetune.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include finetune/CMakeFiles/finetune.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include finetune/CMakeFiles/finetune.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include finetune/CMakeFiles/finetune.dir/flags.make
-
-finetune/CMakeFiles/finetune.dir/finetune.o: finetune/CMakeFiles/finetune.dir/flags.make
-finetune/CMakeFiles/finetune.dir/finetune.o: ../finetune/finetune.cpp
-finetune/CMakeFiles/finetune.dir/finetune.o: finetune/CMakeFiles/finetune.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object finetune/CMakeFiles/finetune.dir/finetune.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT finetune/CMakeFiles/finetune.dir/finetune.o -MF CMakeFiles/finetune.dir/finetune.o.d -o CMakeFiles/finetune.dir/finetune.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/finetune/finetune.cpp
-
-finetune/CMakeFiles/finetune.dir/finetune.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/finetune.dir/finetune.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/finetune/finetune.cpp > CMakeFiles/finetune.dir/finetune.i
-
-finetune/CMakeFiles/finetune.dir/finetune.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/finetune.dir/finetune.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/finetune/finetune.cpp -o CMakeFiles/finetune.dir/finetune.s
-
-# Object files for target finetune
-finetune_OBJECTS = \
-"CMakeFiles/finetune.dir/finetune.o"
-
-# External object files for target finetune
-finetune_EXTERNAL_OBJECTS =
-
-finetune/finetune: finetune/CMakeFiles/finetune.dir/finetune.o
-finetune/finetune: finetune/CMakeFiles/finetune.dir/build.make
-finetune/finetune: finetune/CMakeFiles/finetune.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable finetune"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/finetune.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-finetune/CMakeFiles/finetune.dir/build: finetune/finetune
-.PHONY : finetune/CMakeFiles/finetune.dir/build
-
-finetune/CMakeFiles/finetune.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune && $(CMAKE_COMMAND) -P CMakeFiles/finetune.dir/cmake_clean.cmake
-.PHONY : finetune/CMakeFiles/finetune.dir/clean
-
-finetune/CMakeFiles/finetune.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/finetune /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/CMakeFiles/finetune.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : finetune/CMakeFiles/finetune.dir/depend
-
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/cmake_clean.cmake b/examples/sycl/finetune/CMakeFiles/finetune.dir/cmake_clean.cmake
deleted file mode 100644
index ddc16e11e3bb0..0000000000000
--- a/examples/sycl/finetune/CMakeFiles/finetune.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/finetune.dir/finetune.o"
-  "CMakeFiles/finetune.dir/finetune.o.d"
-  "finetune"
-  "finetune.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/finetune.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.make b/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.make
deleted file mode 100644
index 7b0c0a165e815..0000000000000
--- a/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for finetune.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.ts b/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.ts
deleted file mode 100644
index 60ac2baeed3ac..0000000000000
--- a/examples/sycl/finetune/CMakeFiles/finetune.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for finetune.
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/depend.make b/examples/sycl/finetune/CMakeFiles/finetune.dir/depend.make
deleted file mode 100644
index 69e9f67763a99..0000000000000
--- a/examples/sycl/finetune/CMakeFiles/finetune.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for finetune.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/flags.make b/examples/sycl/finetune/CMakeFiles/finetune.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/finetune/CMakeFiles/finetune.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/link.txt b/examples/sycl/finetune/CMakeFiles/finetune.dir/link.txt
deleted file mode 100644
index c93473a88cc6a..0000000000000
--- a/examples/sycl/finetune/CMakeFiles/finetune.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/finetune.dir/finetune.o -o finetune  -lcommon -lllama 
diff --git a/examples/sycl/finetune/CMakeFiles/finetune.dir/progress.make b/examples/sycl/finetune/CMakeFiles/finetune.dir/progress.make
deleted file mode 100644
index 5a7451db601a4..0000000000000
--- a/examples/sycl/finetune/CMakeFiles/finetune.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 17
-CMAKE_PROGRESS_2 = 18
-
diff --git a/examples/sycl/finetune/CMakeFiles/progress.marks b/examples/sycl/finetune/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/finetune/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/finetune/Makefile b/examples/sycl/finetune/Makefile
deleted file mode 100644
index 455898767ff8a..0000000000000
--- a/examples/sycl/finetune/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-finetune/CMakeFiles/finetune.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 finetune/CMakeFiles/finetune.dir/rule
-.PHONY : finetune/CMakeFiles/finetune.dir/rule
-
-# Convenience name for target.
-finetune: finetune/CMakeFiles/finetune.dir/rule
-.PHONY : finetune
-
-# fast build rule for target.
-finetune/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/build
-.PHONY : finetune/fast
-
-# target to build an object file
-finetune.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/finetune.o
-.PHONY : finetune.o
-
-# target to preprocess a source file
-finetune.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/finetune.i
-.PHONY : finetune.i
-
-# target to generate assembly for a file
-finetune.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f finetune/CMakeFiles/finetune.dir/build.make finetune/CMakeFiles/finetune.dir/finetune.s
-.PHONY : finetune.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... finetune"
-	@echo "... finetune.o"
-	@echo "... finetune.i"
-	@echo "... finetune.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/finetune/cmake_install.cmake b/examples/sycl/finetune/cmake_install.cmake
deleted file mode 100644
index 6ec05e0579313..0000000000000
--- a/examples/sycl/finetune/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/finetune
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/finetune/finetune")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/finetune")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/infill/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/infill/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/infill/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/DependInfo.cmake b/examples/sycl/infill/CMakeFiles/infill.dir/DependInfo.cmake
deleted file mode 100644
index d2945cc856f9e..0000000000000
--- a/examples/sycl/infill/CMakeFiles/infill.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/infill/infill.cpp" "infill/CMakeFiles/infill.dir/infill.o" "gcc" "infill/CMakeFiles/infill.dir/infill.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/build.make b/examples/sycl/infill/CMakeFiles/infill.dir/build.make
deleted file mode 100644
index 053f874a44cfb..0000000000000
--- a/examples/sycl/infill/CMakeFiles/infill.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include infill/CMakeFiles/infill.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include infill/CMakeFiles/infill.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include infill/CMakeFiles/infill.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include infill/CMakeFiles/infill.dir/flags.make
-
-infill/CMakeFiles/infill.dir/infill.o: infill/CMakeFiles/infill.dir/flags.make
-infill/CMakeFiles/infill.dir/infill.o: ../infill/infill.cpp
-infill/CMakeFiles/infill.dir/infill.o: infill/CMakeFiles/infill.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object infill/CMakeFiles/infill.dir/infill.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT infill/CMakeFiles/infill.dir/infill.o -MF CMakeFiles/infill.dir/infill.o.d -o CMakeFiles/infill.dir/infill.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/infill/infill.cpp
-
-infill/CMakeFiles/infill.dir/infill.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/infill.dir/infill.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/infill/infill.cpp > CMakeFiles/infill.dir/infill.i
-
-infill/CMakeFiles/infill.dir/infill.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/infill.dir/infill.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/infill/infill.cpp -o CMakeFiles/infill.dir/infill.s
-
-# Object files for target infill
-infill_OBJECTS = \
-"CMakeFiles/infill.dir/infill.o"
-
-# External object files for target infill
-infill_EXTERNAL_OBJECTS =
-
-infill/infill: infill/CMakeFiles/infill.dir/infill.o
-infill/infill: infill/CMakeFiles/infill.dir/build.make
-infill/infill: infill/CMakeFiles/infill.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable infill"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/infill.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-infill/CMakeFiles/infill.dir/build: infill/infill
-.PHONY : infill/CMakeFiles/infill.dir/build
-
-infill/CMakeFiles/infill.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill && $(CMAKE_COMMAND) -P CMakeFiles/infill.dir/cmake_clean.cmake
-.PHONY : infill/CMakeFiles/infill.dir/clean
-
-infill/CMakeFiles/infill.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/infill /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/CMakeFiles/infill.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : infill/CMakeFiles/infill.dir/depend
-
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/cmake_clean.cmake b/examples/sycl/infill/CMakeFiles/infill.dir/cmake_clean.cmake
deleted file mode 100644
index 87dd3a112b239..0000000000000
--- a/examples/sycl/infill/CMakeFiles/infill.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/infill.dir/infill.o"
-  "CMakeFiles/infill.dir/infill.o.d"
-  "infill"
-  "infill.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/infill.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.make b/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.make
deleted file mode 100644
index b1ec2da73449c..0000000000000
--- a/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for infill.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.ts b/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.ts
deleted file mode 100644
index c8209ec18aa58..0000000000000
--- a/examples/sycl/infill/CMakeFiles/infill.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for infill.
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/depend.make b/examples/sycl/infill/CMakeFiles/infill.dir/depend.make
deleted file mode 100644
index 7baa2f32944b2..0000000000000
--- a/examples/sycl/infill/CMakeFiles/infill.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for infill.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/flags.make b/examples/sycl/infill/CMakeFiles/infill.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/infill/CMakeFiles/infill.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/link.txt b/examples/sycl/infill/CMakeFiles/infill.dir/link.txt
deleted file mode 100644
index 591c34ed36c6f..0000000000000
--- a/examples/sycl/infill/CMakeFiles/infill.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/infill.dir/infill.o -o infill  -lcommon -lllama 
diff --git a/examples/sycl/infill/CMakeFiles/infill.dir/progress.make b/examples/sycl/infill/CMakeFiles/infill.dir/progress.make
deleted file mode 100644
index 48b3d8a54961a..0000000000000
--- a/examples/sycl/infill/CMakeFiles/infill.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 19
-CMAKE_PROGRESS_2 = 20
-
diff --git a/examples/sycl/infill/CMakeFiles/progress.marks b/examples/sycl/infill/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/infill/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/infill/Makefile b/examples/sycl/infill/Makefile
deleted file mode 100644
index f380cf65e6242..0000000000000
--- a/examples/sycl/infill/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-infill/CMakeFiles/infill.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 infill/CMakeFiles/infill.dir/rule
-.PHONY : infill/CMakeFiles/infill.dir/rule
-
-# Convenience name for target.
-infill: infill/CMakeFiles/infill.dir/rule
-.PHONY : infill
-
-# fast build rule for target.
-infill/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/build
-.PHONY : infill/fast
-
-# target to build an object file
-infill.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/infill.o
-.PHONY : infill.o
-
-# target to preprocess a source file
-infill.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/infill.i
-.PHONY : infill.i
-
-# target to generate assembly for a file
-infill.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f infill/CMakeFiles/infill.dir/build.make infill/CMakeFiles/infill.dir/infill.s
-.PHONY : infill.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... infill"
-	@echo "... infill.o"
-	@echo "... infill.i"
-	@echo "... infill.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/infill/cmake_install.cmake b/examples/sycl/infill/cmake_install.cmake
deleted file mode 100644
index 75eebb36e8919..0000000000000
--- a/examples/sycl/infill/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/infill
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/infill/infill")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/infill")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/llama-bench/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/llama-bench/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/llama-bench/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake
deleted file mode 100644
index 3474b7e651b40..0000000000000
--- a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench/llama-bench.cpp" "llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o" "gcc" "llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/build.make b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/build.make
deleted file mode 100644
index f5e7cbbbe44f7..0000000000000
--- a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include llama-bench/CMakeFiles/llama-bench.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include llama-bench/CMakeFiles/llama-bench.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include llama-bench/CMakeFiles/llama-bench.dir/flags.make
-
-llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o: llama-bench/CMakeFiles/llama-bench.dir/flags.make
-llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o: ../llama-bench/llama-bench.cpp
-llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o: llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o -MF CMakeFiles/llama-bench.dir/llama-bench.o.d -o CMakeFiles/llama-bench.dir/llama-bench.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench/llama-bench.cpp
-
-llama-bench/CMakeFiles/llama-bench.dir/llama-bench.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/llama-bench.dir/llama-bench.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench/llama-bench.cpp > CMakeFiles/llama-bench.dir/llama-bench.i
-
-llama-bench/CMakeFiles/llama-bench.dir/llama-bench.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/llama-bench.dir/llama-bench.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench/llama-bench.cpp -o CMakeFiles/llama-bench.dir/llama-bench.s
-
-# Object files for target llama-bench
-llama__bench_OBJECTS = \
-"CMakeFiles/llama-bench.dir/llama-bench.o"
-
-# External object files for target llama-bench
-llama__bench_EXTERNAL_OBJECTS =
-
-llama-bench/llama-bench: llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o
-llama-bench/llama-bench: llama-bench/CMakeFiles/llama-bench.dir/build.make
-llama-bench/llama-bench: llama-bench/CMakeFiles/llama-bench.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable llama-bench"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/llama-bench.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-llama-bench/CMakeFiles/llama-bench.dir/build: llama-bench/llama-bench
-.PHONY : llama-bench/CMakeFiles/llama-bench.dir/build
-
-llama-bench/CMakeFiles/llama-bench.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench && $(CMAKE_COMMAND) -P CMakeFiles/llama-bench.dir/cmake_clean.cmake
-.PHONY : llama-bench/CMakeFiles/llama-bench.dir/clean
-
-llama-bench/CMakeFiles/llama-bench.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : llama-bench/CMakeFiles/llama-bench.dir/depend
-
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/cmake_clean.cmake b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/cmake_clean.cmake
deleted file mode 100644
index cb6655fb83e41..0000000000000
--- a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/llama-bench.dir/llama-bench.o"
-  "CMakeFiles/llama-bench.dir/llama-bench.o.d"
-  "llama-bench"
-  "llama-bench.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/llama-bench.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.make b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.make
deleted file mode 100644
index 1c2e072d2d272..0000000000000
--- a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for llama-bench.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.ts b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.ts
deleted file mode 100644
index e2fdaf1db7a02..0000000000000
--- a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for llama-bench.
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/depend.make b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/depend.make
deleted file mode 100644
index f3b3c9b066e7b..0000000000000
--- a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for llama-bench.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/flags.make b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/link.txt b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/link.txt
deleted file mode 100644
index f751098bcfad6..0000000000000
--- a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/llama-bench.dir/llama-bench.o -o llama-bench  -lcommon -lllama 
diff --git a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/progress.make b/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/progress.make
deleted file mode 100644
index 6ec2abf9db4ad..0000000000000
--- a/examples/sycl/llama-bench/CMakeFiles/llama-bench.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 21
-CMAKE_PROGRESS_2 = 22
-
diff --git a/examples/sycl/llama-bench/CMakeFiles/progress.marks b/examples/sycl/llama-bench/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/llama-bench/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/llama-bench/Makefile b/examples/sycl/llama-bench/Makefile
deleted file mode 100644
index 295a518e0f08f..0000000000000
--- a/examples/sycl/llama-bench/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-llama-bench/CMakeFiles/llama-bench.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llama-bench/CMakeFiles/llama-bench.dir/rule
-.PHONY : llama-bench/CMakeFiles/llama-bench.dir/rule
-
-# Convenience name for target.
-llama-bench: llama-bench/CMakeFiles/llama-bench.dir/rule
-.PHONY : llama-bench
-
-# fast build rule for target.
-llama-bench/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/build
-.PHONY : llama-bench/fast
-
-# target to build an object file
-llama-bench.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/llama-bench.o
-.PHONY : llama-bench.o
-
-# target to preprocess a source file
-llama-bench.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/llama-bench.i
-.PHONY : llama-bench.i
-
-# target to generate assembly for a file
-llama-bench.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llama-bench/CMakeFiles/llama-bench.dir/build.make llama-bench/CMakeFiles/llama-bench.dir/llama-bench.s
-.PHONY : llama-bench.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... llama-bench"
-	@echo "... llama-bench.o"
-	@echo "... llama-bench.i"
-	@echo "... llama-bench.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/llama-bench/cmake_install.cmake b/examples/sycl/llama-bench/cmake_install.cmake
deleted file mode 100644
index 72de9db7d3b6f..0000000000000
--- a/examples/sycl/llama-bench/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/llama-bench
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llama-bench/llama-bench")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llama-bench")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/llava/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/llava/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/llava/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/DependInfo.cmake b/examples/sycl/llava/CMakeFiles/llava-cli.dir/DependInfo.cmake
deleted file mode 100644
index 891956fae6f83..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava-cli.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava-cli.cpp" "llava/CMakeFiles/llava-cli.dir/llava-cli.o" "gcc" "llava/CMakeFiles/llava-cli.dir/llava-cli.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/build.make b/examples/sycl/llava/CMakeFiles/llava-cli.dir/build.make
deleted file mode 100644
index 6459cbf3c94c6..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava-cli.dir/build.make
+++ /dev/null
@@ -1,114 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include llava/CMakeFiles/llava-cli.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include llava/CMakeFiles/llava-cli.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include llava/CMakeFiles/llava-cli.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include llava/CMakeFiles/llava-cli.dir/flags.make
-
-llava/CMakeFiles/llava-cli.dir/llava-cli.o: llava/CMakeFiles/llava-cli.dir/flags.make
-llava/CMakeFiles/llava-cli.dir/llava-cli.o: ../llava/llava-cli.cpp
-llava/CMakeFiles/llava-cli.dir/llava-cli.o: llava/CMakeFiles/llava-cli.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object llava/CMakeFiles/llava-cli.dir/llava-cli.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT llava/CMakeFiles/llava-cli.dir/llava-cli.o -MF CMakeFiles/llava-cli.dir/llava-cli.o.d -o CMakeFiles/llava-cli.dir/llava-cli.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava-cli.cpp
-
-llava/CMakeFiles/llava-cli.dir/llava-cli.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/llava-cli.dir/llava-cli.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava-cli.cpp > CMakeFiles/llava-cli.dir/llava-cli.i
-
-llava/CMakeFiles/llava-cli.dir/llava-cli.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/llava-cli.dir/llava-cli.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava-cli.cpp -o CMakeFiles/llava-cli.dir/llava-cli.s
-
-# Object files for target llava-cli
-llava__cli_OBJECTS = \
-"CMakeFiles/llava-cli.dir/llava-cli.o"
-
-# External object files for target llava-cli
-llava__cli_EXTERNAL_OBJECTS = \
-"/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir/llava.o" \
-"/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir/clip.o"
-
-llava/llava-cli: llava/CMakeFiles/llava-cli.dir/llava-cli.o
-llava/llava-cli: llava/CMakeFiles/llava.dir/llava.o
-llava/llava-cli: llava/CMakeFiles/llava.dir/clip.o
-llava/llava-cli: llava/CMakeFiles/llava-cli.dir/build.make
-llava/llava-cli: llava/CMakeFiles/llava-cli.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable llava-cli"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/llava-cli.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-llava/CMakeFiles/llava-cli.dir/build: llava/llava-cli
-.PHONY : llava/CMakeFiles/llava-cli.dir/build
-
-llava/CMakeFiles/llava-cli.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -P CMakeFiles/llava-cli.dir/cmake_clean.cmake
-.PHONY : llava/CMakeFiles/llava-cli.dir/clean
-
-llava/CMakeFiles/llava-cli.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava-cli.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : llava/CMakeFiles/llava-cli.dir/depend
-
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/cmake_clean.cmake b/examples/sycl/llava/CMakeFiles/llava-cli.dir/cmake_clean.cmake
deleted file mode 100644
index 9028260a0a143..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava-cli.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/llava-cli.dir/llava-cli.o"
-  "CMakeFiles/llava-cli.dir/llava-cli.o.d"
-  "llava-cli"
-  "llava-cli.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/llava-cli.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.make b/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.make
deleted file mode 100644
index 0814eb432bb83..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for llava-cli.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.ts b/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.ts
deleted file mode 100644
index 23b629e3f6a90..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava-cli.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for llava-cli.
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/depend.make b/examples/sycl/llava/CMakeFiles/llava-cli.dir/depend.make
deleted file mode 100644
index 42964b654d8bf..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava-cli.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for llava-cli.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/flags.make b/examples/sycl/llava/CMakeFiles/llava-cli.dir/flags.make
deleted file mode 100644
index dd2a15593d7fa..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava-cli.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/. -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/../.. -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/../../common
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/link.txt b/examples/sycl/llava/CMakeFiles/llava-cli.dir/link.txt
deleted file mode 100644
index 2dc90433d77fe..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava-cli.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/llava-cli.dir/llava-cli.o CMakeFiles/llava.dir/llava.o CMakeFiles/llava.dir/clip.o -o llava-cli  -lcommon -lllama -lggml -lllama 
diff --git a/examples/sycl/llava/CMakeFiles/llava-cli.dir/progress.make b/examples/sycl/llava/CMakeFiles/llava-cli.dir/progress.make
deleted file mode 100644
index 9fd0bf530ff91..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava-cli.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 25
-CMAKE_PROGRESS_2 = 26
-
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/DependInfo.cmake b/examples/sycl/llava/CMakeFiles/llava.dir/DependInfo.cmake
deleted file mode 100644
index 9612457064b23..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava.dir/DependInfo.cmake
+++ /dev/null
@@ -1,20 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/llava/clip.cpp" "llava/CMakeFiles/llava.dir/clip.o" "gcc" "llava/CMakeFiles/llava.dir/clip.o.d"
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava.cpp" "llava/CMakeFiles/llava.dir/llava.o" "gcc" "llava/CMakeFiles/llava.dir/llava.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/build.make b/examples/sycl/llava/CMakeFiles/llava.dir/build.make
deleted file mode 100644
index 1750c09553cb0..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava.dir/build.make
+++ /dev/null
@@ -1,116 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include llava/CMakeFiles/llava.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include llava/CMakeFiles/llava.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include llava/CMakeFiles/llava.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include llava/CMakeFiles/llava.dir/flags.make
-
-llava/CMakeFiles/llava.dir/llava.o: llava/CMakeFiles/llava.dir/flags.make
-llava/CMakeFiles/llava.dir/llava.o: ../llava/llava.cpp
-llava/CMakeFiles/llava.dir/llava.o: llava/CMakeFiles/llava.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object llava/CMakeFiles/llava.dir/llava.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT llava/CMakeFiles/llava.dir/llava.o -MF CMakeFiles/llava.dir/llava.o.d -o CMakeFiles/llava.dir/llava.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava.cpp
-
-llava/CMakeFiles/llava.dir/llava.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/llava.dir/llava.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava.cpp > CMakeFiles/llava.dir/llava.i
-
-llava/CMakeFiles/llava.dir/llava.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/llava.dir/llava.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/llava/llava.cpp -o CMakeFiles/llava.dir/llava.s
-
-llava/CMakeFiles/llava.dir/clip.o: llava/CMakeFiles/llava.dir/flags.make
-llava/CMakeFiles/llava.dir/clip.o: ../llava/clip.cpp
-llava/CMakeFiles/llava.dir/clip.o: llava/CMakeFiles/llava.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Building CXX object llava/CMakeFiles/llava.dir/clip.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT llava/CMakeFiles/llava.dir/clip.o -MF CMakeFiles/llava.dir/clip.o.d -o CMakeFiles/llava.dir/clip.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/llava/clip.cpp
-
-llava/CMakeFiles/llava.dir/clip.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/llava.dir/clip.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/llava/clip.cpp > CMakeFiles/llava.dir/clip.i
-
-llava/CMakeFiles/llava.dir/clip.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/llava.dir/clip.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/llava/clip.cpp -o CMakeFiles/llava.dir/clip.s
-
-llava: llava/CMakeFiles/llava.dir/llava.o
-llava: llava/CMakeFiles/llava.dir/clip.o
-llava: llava/CMakeFiles/llava.dir/build.make
-.PHONY : llava
-
-# Rule to build all files generated by this target.
-llava/CMakeFiles/llava.dir/build: llava
-.PHONY : llava/CMakeFiles/llava.dir/build
-
-llava/CMakeFiles/llava.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -P CMakeFiles/llava.dir/cmake_clean.cmake
-.PHONY : llava/CMakeFiles/llava.dir/clean
-
-llava/CMakeFiles/llava.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : llava/CMakeFiles/llava.dir/depend
-
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/cmake_clean.cmake b/examples/sycl/llava/CMakeFiles/llava.dir/cmake_clean.cmake
deleted file mode 100644
index 3883a0e373075..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/llava.dir/clip.o"
-  "CMakeFiles/llava.dir/clip.o.d"
-  "CMakeFiles/llava.dir/llava.o"
-  "CMakeFiles/llava.dir/llava.o.d"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/llava.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.make b/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.make
deleted file mode 100644
index 6dac2f664c5d8..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for llava.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.ts b/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.ts
deleted file mode 100644
index 4e2caa7aebdcf..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for llava.
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/depend.make b/examples/sycl/llava/CMakeFiles/llava.dir/depend.make
deleted file mode 100644
index c853fa62d1aa8..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for llava.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/flags.make b/examples/sycl/llava/CMakeFiles/llava.dir/flags.make
deleted file mode 100644
index badb2e522d3d5..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/. -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/../.. -I/home/jianyuzh/ws/llama.cpp/develop/examples/llava/../../common
-
-CXX_FLAGS = -Wno-cast-qual
-
diff --git a/examples/sycl/llava/CMakeFiles/llava.dir/progress.make b/examples/sycl/llava/CMakeFiles/llava.dir/progress.make
deleted file mode 100644
index 6c29f4fb5e35d..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 23
-CMAKE_PROGRESS_2 = 24
-
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/DependInfo.cmake b/examples/sycl/llava/CMakeFiles/llava_static.dir/DependInfo.cmake
deleted file mode 100644
index dc55e44b5556f..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava_static.dir/DependInfo.cmake
+++ /dev/null
@@ -1,18 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/build.make b/examples/sycl/llava/CMakeFiles/llava_static.dir/build.make
deleted file mode 100644
index 204fef459a2c5..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava_static.dir/build.make
+++ /dev/null
@@ -1,99 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include llava/CMakeFiles/llava_static.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include llava/CMakeFiles/llava_static.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include llava/CMakeFiles/llava_static.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include llava/CMakeFiles/llava_static.dir/flags.make
-
-# Object files for target llava_static
-llava_static_OBJECTS =
-
-# External object files for target llava_static
-llava_static_EXTERNAL_OBJECTS = \
-"/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir/llava.o" \
-"/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava.dir/clip.o"
-
-llava/libllava_static.a: llava/CMakeFiles/llava.dir/llava.o
-llava/libllava_static.a: llava/CMakeFiles/llava.dir/clip.o
-llava/libllava_static.a: llava/CMakeFiles/llava_static.dir/build.make
-llava/libllava_static.a: llava/CMakeFiles/llava_static.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Linking CXX static library libllava_static.a"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -P CMakeFiles/llava_static.dir/cmake_clean_target.cmake
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/llava_static.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-llava/CMakeFiles/llava_static.dir/build: llava/libllava_static.a
-.PHONY : llava/CMakeFiles/llava_static.dir/build
-
-llava/CMakeFiles/llava_static.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava && $(CMAKE_COMMAND) -P CMakeFiles/llava_static.dir/cmake_clean.cmake
-.PHONY : llava/CMakeFiles/llava_static.dir/clean
-
-llava/CMakeFiles/llava_static.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/CMakeFiles/llava_static.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : llava/CMakeFiles/llava_static.dir/depend
-
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean.cmake b/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean.cmake
deleted file mode 100644
index f45eae4621fb1..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,9 +0,0 @@
-file(REMOVE_RECURSE
-  "libllava_static.a"
-  "libllava_static.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/llava_static.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean_target.cmake b/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean_target.cmake
deleted file mode 100644
index 811ad1cc67be8..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava_static.dir/cmake_clean_target.cmake
+++ /dev/null
@@ -1,3 +0,0 @@
-file(REMOVE_RECURSE
-  "libllava_static.a"
-)
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.make b/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.make
deleted file mode 100644
index 53d1fc0d078f6..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for llava_static.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.ts b/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.ts
deleted file mode 100644
index 795a3979cf036..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava_static.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for llava_static.
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/depend.make b/examples/sycl/llava/CMakeFiles/llava_static.dir/depend.make
deleted file mode 100644
index dbb3703db5596..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava_static.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for llava_static.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/flags.make b/examples/sycl/llava/CMakeFiles/llava_static.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava_static.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/link.txt b/examples/sycl/llava/CMakeFiles/llava_static.dir/link.txt
deleted file mode 100644
index c3f37a40baa9a..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava_static.dir/link.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-/usr/bin/ar qc libllava_static.a CMakeFiles/llava.dir/llava.o CMakeFiles/llava.dir/clip.o
-/usr/bin/ranlib libllava_static.a
diff --git a/examples/sycl/llava/CMakeFiles/llava_static.dir/progress.make b/examples/sycl/llava/CMakeFiles/llava_static.dir/progress.make
deleted file mode 100644
index 6e61838c631fb..0000000000000
--- a/examples/sycl/llava/CMakeFiles/llava_static.dir/progress.make
+++ /dev/null
@@ -1,2 +0,0 @@
-CMAKE_PROGRESS_1 = 27
-
diff --git a/examples/sycl/llava/CMakeFiles/progress.marks b/examples/sycl/llava/CMakeFiles/progress.marks
deleted file mode 100644
index 7ed6ff82de6bc..0000000000000
--- a/examples/sycl/llava/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-5
diff --git a/examples/sycl/llava/Makefile b/examples/sycl/llava/Makefile
deleted file mode 100644
index fba626a99cbf4..0000000000000
--- a/examples/sycl/llava/Makefile
+++ /dev/null
@@ -1,288 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-llava/CMakeFiles/llava.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava.dir/rule
-.PHONY : llava/CMakeFiles/llava.dir/rule
-
-# Convenience name for target.
-llava: llava/CMakeFiles/llava.dir/rule
-.PHONY : llava
-
-# fast build rule for target.
-llava/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/build
-.PHONY : llava/fast
-
-# Convenience name for target.
-llava/CMakeFiles/llava_static.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava_static.dir/rule
-.PHONY : llava/CMakeFiles/llava_static.dir/rule
-
-# Convenience name for target.
-llava_static: llava/CMakeFiles/llava_static.dir/rule
-.PHONY : llava_static
-
-# fast build rule for target.
-llava_static/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava_static.dir/build.make llava/CMakeFiles/llava_static.dir/build
-.PHONY : llava_static/fast
-
-# Convenience name for target.
-llava/CMakeFiles/llava-cli.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 llava/CMakeFiles/llava-cli.dir/rule
-.PHONY : llava/CMakeFiles/llava-cli.dir/rule
-
-# Convenience name for target.
-llava-cli: llava/CMakeFiles/llava-cli.dir/rule
-.PHONY : llava-cli
-
-# fast build rule for target.
-llava-cli/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/build
-.PHONY : llava-cli/fast
-
-# target to build an object file
-clip.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/clip.o
-.PHONY : clip.o
-
-# target to preprocess a source file
-clip.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/clip.i
-.PHONY : clip.i
-
-# target to generate assembly for a file
-clip.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/clip.s
-.PHONY : clip.s
-
-# target to build an object file
-llava-cli.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/llava-cli.o
-.PHONY : llava-cli.o
-
-# target to preprocess a source file
-llava-cli.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/llava-cli.i
-.PHONY : llava-cli.i
-
-# target to generate assembly for a file
-llava-cli.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava-cli.dir/build.make llava/CMakeFiles/llava-cli.dir/llava-cli.s
-.PHONY : llava-cli.s
-
-# target to build an object file
-llava.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/llava.o
-.PHONY : llava.o
-
-# target to preprocess a source file
-llava.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/llava.i
-.PHONY : llava.i
-
-# target to generate assembly for a file
-llava.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f llava/CMakeFiles/llava.dir/build.make llava/CMakeFiles/llava.dir/llava.s
-.PHONY : llava.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... llava"
-	@echo "... llava-cli"
-	@echo "... llava_static"
-	@echo "... clip.o"
-	@echo "... clip.i"
-	@echo "... clip.s"
-	@echo "... llava-cli.o"
-	@echo "... llava-cli.i"
-	@echo "... llava-cli.s"
-	@echo "... llava.o"
-	@echo "... llava.i"
-	@echo "... llava.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/llava/cmake_install.cmake b/examples/sycl/llava/cmake_install.cmake
deleted file mode 100644
index babdc45902de3..0000000000000
--- a/examples/sycl/llava/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/llava
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/llava/llava-cli")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/llava-cli")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/lookahead/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/lookahead/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/lookahead/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake
deleted file mode 100644
index 275ad578cff31..0000000000000
--- a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/lookahead/lookahead.cpp" "lookahead/CMakeFiles/lookahead.dir/lookahead.o" "gcc" "lookahead/CMakeFiles/lookahead.dir/lookahead.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/build.make b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/build.make
deleted file mode 100644
index 28e8b3553a5b5..0000000000000
--- a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include lookahead/CMakeFiles/lookahead.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include lookahead/CMakeFiles/lookahead.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include lookahead/CMakeFiles/lookahead.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include lookahead/CMakeFiles/lookahead.dir/flags.make
-
-lookahead/CMakeFiles/lookahead.dir/lookahead.o: lookahead/CMakeFiles/lookahead.dir/flags.make
-lookahead/CMakeFiles/lookahead.dir/lookahead.o: ../lookahead/lookahead.cpp
-lookahead/CMakeFiles/lookahead.dir/lookahead.o: lookahead/CMakeFiles/lookahead.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object lookahead/CMakeFiles/lookahead.dir/lookahead.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT lookahead/CMakeFiles/lookahead.dir/lookahead.o -MF CMakeFiles/lookahead.dir/lookahead.o.d -o CMakeFiles/lookahead.dir/lookahead.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/lookahead/lookahead.cpp
-
-lookahead/CMakeFiles/lookahead.dir/lookahead.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/lookahead.dir/lookahead.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/lookahead/lookahead.cpp > CMakeFiles/lookahead.dir/lookahead.i
-
-lookahead/CMakeFiles/lookahead.dir/lookahead.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/lookahead.dir/lookahead.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/lookahead/lookahead.cpp -o CMakeFiles/lookahead.dir/lookahead.s
-
-# Object files for target lookahead
-lookahead_OBJECTS = \
-"CMakeFiles/lookahead.dir/lookahead.o"
-
-# External object files for target lookahead
-lookahead_EXTERNAL_OBJECTS =
-
-lookahead/lookahead: lookahead/CMakeFiles/lookahead.dir/lookahead.o
-lookahead/lookahead: lookahead/CMakeFiles/lookahead.dir/build.make
-lookahead/lookahead: lookahead/CMakeFiles/lookahead.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable lookahead"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/lookahead.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-lookahead/CMakeFiles/lookahead.dir/build: lookahead/lookahead
-.PHONY : lookahead/CMakeFiles/lookahead.dir/build
-
-lookahead/CMakeFiles/lookahead.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead && $(CMAKE_COMMAND) -P CMakeFiles/lookahead.dir/cmake_clean.cmake
-.PHONY : lookahead/CMakeFiles/lookahead.dir/clean
-
-lookahead/CMakeFiles/lookahead.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/lookahead /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/CMakeFiles/lookahead.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : lookahead/CMakeFiles/lookahead.dir/depend
-
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/cmake_clean.cmake b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/cmake_clean.cmake
deleted file mode 100644
index 1784906a410e3..0000000000000
--- a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/lookahead.dir/lookahead.o"
-  "CMakeFiles/lookahead.dir/lookahead.o.d"
-  "lookahead"
-  "lookahead.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/lookahead.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.make b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.make
deleted file mode 100644
index e16e15b4ebd1c..0000000000000
--- a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for lookahead.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.ts b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.ts
deleted file mode 100644
index abdca07390ae2..0000000000000
--- a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for lookahead.
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/depend.make b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/depend.make
deleted file mode 100644
index 0aaa54f625ea9..0000000000000
--- a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for lookahead.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/flags.make b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/link.txt b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/link.txt
deleted file mode 100644
index 490524c8c691f..0000000000000
--- a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/lookahead.dir/lookahead.o -o lookahead  -lcommon -lllama 
diff --git a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/progress.make b/examples/sycl/lookahead/CMakeFiles/lookahead.dir/progress.make
deleted file mode 100644
index ec204d332ef87..0000000000000
--- a/examples/sycl/lookahead/CMakeFiles/lookahead.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 28
-CMAKE_PROGRESS_2 = 29
-
diff --git a/examples/sycl/lookahead/CMakeFiles/progress.marks b/examples/sycl/lookahead/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/lookahead/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/lookahead/Makefile b/examples/sycl/lookahead/Makefile
deleted file mode 100644
index b6028417d5939..0000000000000
--- a/examples/sycl/lookahead/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-lookahead/CMakeFiles/lookahead.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookahead/CMakeFiles/lookahead.dir/rule
-.PHONY : lookahead/CMakeFiles/lookahead.dir/rule
-
-# Convenience name for target.
-lookahead: lookahead/CMakeFiles/lookahead.dir/rule
-.PHONY : lookahead
-
-# fast build rule for target.
-lookahead/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/build
-.PHONY : lookahead/fast
-
-# target to build an object file
-lookahead.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/lookahead.o
-.PHONY : lookahead.o
-
-# target to preprocess a source file
-lookahead.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/lookahead.i
-.PHONY : lookahead.i
-
-# target to generate assembly for a file
-lookahead.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookahead/CMakeFiles/lookahead.dir/build.make lookahead/CMakeFiles/lookahead.dir/lookahead.s
-.PHONY : lookahead.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... lookahead"
-	@echo "... lookahead.o"
-	@echo "... lookahead.i"
-	@echo "... lookahead.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/lookahead/cmake_install.cmake b/examples/sycl/lookahead/cmake_install.cmake
deleted file mode 100644
index 9a2af72130d76..0000000000000
--- a/examples/sycl/lookahead/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/lookahead
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookahead/lookahead")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookahead")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/lookup/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/lookup/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/lookup/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/DependInfo.cmake b/examples/sycl/lookup/CMakeFiles/lookup.dir/DependInfo.cmake
deleted file mode 100644
index 235904a16eade..0000000000000
--- a/examples/sycl/lookup/CMakeFiles/lookup.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/lookup/lookup.cpp" "lookup/CMakeFiles/lookup.dir/lookup.o" "gcc" "lookup/CMakeFiles/lookup.dir/lookup.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/build.make b/examples/sycl/lookup/CMakeFiles/lookup.dir/build.make
deleted file mode 100644
index 4870a818310f0..0000000000000
--- a/examples/sycl/lookup/CMakeFiles/lookup.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include lookup/CMakeFiles/lookup.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include lookup/CMakeFiles/lookup.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include lookup/CMakeFiles/lookup.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include lookup/CMakeFiles/lookup.dir/flags.make
-
-lookup/CMakeFiles/lookup.dir/lookup.o: lookup/CMakeFiles/lookup.dir/flags.make
-lookup/CMakeFiles/lookup.dir/lookup.o: ../lookup/lookup.cpp
-lookup/CMakeFiles/lookup.dir/lookup.o: lookup/CMakeFiles/lookup.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object lookup/CMakeFiles/lookup.dir/lookup.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT lookup/CMakeFiles/lookup.dir/lookup.o -MF CMakeFiles/lookup.dir/lookup.o.d -o CMakeFiles/lookup.dir/lookup.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/lookup/lookup.cpp
-
-lookup/CMakeFiles/lookup.dir/lookup.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/lookup.dir/lookup.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/lookup/lookup.cpp > CMakeFiles/lookup.dir/lookup.i
-
-lookup/CMakeFiles/lookup.dir/lookup.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/lookup.dir/lookup.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/lookup/lookup.cpp -o CMakeFiles/lookup.dir/lookup.s
-
-# Object files for target lookup
-lookup_OBJECTS = \
-"CMakeFiles/lookup.dir/lookup.o"
-
-# External object files for target lookup
-lookup_EXTERNAL_OBJECTS =
-
-lookup/lookup: lookup/CMakeFiles/lookup.dir/lookup.o
-lookup/lookup: lookup/CMakeFiles/lookup.dir/build.make
-lookup/lookup: lookup/CMakeFiles/lookup.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable lookup"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/lookup.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-lookup/CMakeFiles/lookup.dir/build: lookup/lookup
-.PHONY : lookup/CMakeFiles/lookup.dir/build
-
-lookup/CMakeFiles/lookup.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup && $(CMAKE_COMMAND) -P CMakeFiles/lookup.dir/cmake_clean.cmake
-.PHONY : lookup/CMakeFiles/lookup.dir/clean
-
-lookup/CMakeFiles/lookup.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/lookup /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/CMakeFiles/lookup.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : lookup/CMakeFiles/lookup.dir/depend
-
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/cmake_clean.cmake b/examples/sycl/lookup/CMakeFiles/lookup.dir/cmake_clean.cmake
deleted file mode 100644
index 395eb958ce111..0000000000000
--- a/examples/sycl/lookup/CMakeFiles/lookup.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/lookup.dir/lookup.o"
-  "CMakeFiles/lookup.dir/lookup.o.d"
-  "lookup"
-  "lookup.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/lookup.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.make b/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.make
deleted file mode 100644
index ee5894814a53c..0000000000000
--- a/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for lookup.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.ts b/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.ts
deleted file mode 100644
index 7fcb2448b23bf..0000000000000
--- a/examples/sycl/lookup/CMakeFiles/lookup.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for lookup.
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/depend.make b/examples/sycl/lookup/CMakeFiles/lookup.dir/depend.make
deleted file mode 100644
index 806a9a3b1639d..0000000000000
--- a/examples/sycl/lookup/CMakeFiles/lookup.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for lookup.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/flags.make b/examples/sycl/lookup/CMakeFiles/lookup.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/lookup/CMakeFiles/lookup.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/link.txt b/examples/sycl/lookup/CMakeFiles/lookup.dir/link.txt
deleted file mode 100644
index 7f181c941a9c8..0000000000000
--- a/examples/sycl/lookup/CMakeFiles/lookup.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/lookup.dir/lookup.o -o lookup  -lcommon -lllama 
diff --git a/examples/sycl/lookup/CMakeFiles/lookup.dir/progress.make b/examples/sycl/lookup/CMakeFiles/lookup.dir/progress.make
deleted file mode 100644
index 335ef43aa78f1..0000000000000
--- a/examples/sycl/lookup/CMakeFiles/lookup.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 30
-CMAKE_PROGRESS_2 = 31
-
diff --git a/examples/sycl/lookup/CMakeFiles/progress.marks b/examples/sycl/lookup/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/lookup/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/lookup/Makefile b/examples/sycl/lookup/Makefile
deleted file mode 100644
index d125b37f53c62..0000000000000
--- a/examples/sycl/lookup/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-lookup/CMakeFiles/lookup.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 lookup/CMakeFiles/lookup.dir/rule
-.PHONY : lookup/CMakeFiles/lookup.dir/rule
-
-# Convenience name for target.
-lookup: lookup/CMakeFiles/lookup.dir/rule
-.PHONY : lookup
-
-# fast build rule for target.
-lookup/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/build
-.PHONY : lookup/fast
-
-# target to build an object file
-lookup.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/lookup.o
-.PHONY : lookup.o
-
-# target to preprocess a source file
-lookup.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/lookup.i
-.PHONY : lookup.i
-
-# target to generate assembly for a file
-lookup.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f lookup/CMakeFiles/lookup.dir/build.make lookup/CMakeFiles/lookup.dir/lookup.s
-.PHONY : lookup.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... lookup"
-	@echo "... lookup.o"
-	@echo "... lookup.i"
-	@echo "... lookup.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/lookup/cmake_install.cmake b/examples/sycl/lookup/cmake_install.cmake
deleted file mode 100644
index 0c81d00f52886..0000000000000
--- a/examples/sycl/lookup/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/lookup
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/lookup/lookup")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/lookup")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/main/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/main/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/main/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/main/CMakeFiles/main.dir/DependInfo.cmake b/examples/sycl/main/CMakeFiles/main.dir/DependInfo.cmake
deleted file mode 100644
index 397618f7b1c3b..0000000000000
--- a/examples/sycl/main/CMakeFiles/main.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/main/main.cpp" "main/CMakeFiles/main.dir/main.o" "gcc" "main/CMakeFiles/main.dir/main.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/main/CMakeFiles/main.dir/build.make b/examples/sycl/main/CMakeFiles/main.dir/build.make
deleted file mode 100644
index 048af40ba5b24..0000000000000
--- a/examples/sycl/main/CMakeFiles/main.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include main/CMakeFiles/main.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include main/CMakeFiles/main.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include main/CMakeFiles/main.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include main/CMakeFiles/main.dir/flags.make
-
-main/CMakeFiles/main.dir/main.o: main/CMakeFiles/main.dir/flags.make
-main/CMakeFiles/main.dir/main.o: ../main/main.cpp
-main/CMakeFiles/main.dir/main.o: main/CMakeFiles/main.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object main/CMakeFiles/main.dir/main.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT main/CMakeFiles/main.dir/main.o -MF CMakeFiles/main.dir/main.o.d -o CMakeFiles/main.dir/main.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/main/main.cpp
-
-main/CMakeFiles/main.dir/main.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/main.dir/main.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/main/main.cpp > CMakeFiles/main.dir/main.i
-
-main/CMakeFiles/main.dir/main.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/main.dir/main.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/main/main.cpp -o CMakeFiles/main.dir/main.s
-
-# Object files for target main
-main_OBJECTS = \
-"CMakeFiles/main.dir/main.o"
-
-# External object files for target main
-main_EXTERNAL_OBJECTS =
-
-main/main: main/CMakeFiles/main.dir/main.o
-main/main: main/CMakeFiles/main.dir/build.make
-main/main: main/CMakeFiles/main.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable main"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/main.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-main/CMakeFiles/main.dir/build: main/main
-.PHONY : main/CMakeFiles/main.dir/build
-
-main/CMakeFiles/main.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main && $(CMAKE_COMMAND) -P CMakeFiles/main.dir/cmake_clean.cmake
-.PHONY : main/CMakeFiles/main.dir/clean
-
-main/CMakeFiles/main.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/main /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/CMakeFiles/main.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : main/CMakeFiles/main.dir/depend
-
diff --git a/examples/sycl/main/CMakeFiles/main.dir/cmake_clean.cmake b/examples/sycl/main/CMakeFiles/main.dir/cmake_clean.cmake
deleted file mode 100644
index f408648c7af77..0000000000000
--- a/examples/sycl/main/CMakeFiles/main.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/main.dir/main.o"
-  "CMakeFiles/main.dir/main.o.d"
-  "main"
-  "main.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/main.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.make b/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.make
deleted file mode 100644
index 386f0a8f7a3c6..0000000000000
--- a/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for main.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.ts b/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.ts
deleted file mode 100644
index a3467f4e27f70..0000000000000
--- a/examples/sycl/main/CMakeFiles/main.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for main.
diff --git a/examples/sycl/main/CMakeFiles/main.dir/depend.make b/examples/sycl/main/CMakeFiles/main.dir/depend.make
deleted file mode 100644
index 7e75fee722d29..0000000000000
--- a/examples/sycl/main/CMakeFiles/main.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for main.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/main/CMakeFiles/main.dir/flags.make b/examples/sycl/main/CMakeFiles/main.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/main/CMakeFiles/main.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/main/CMakeFiles/main.dir/link.txt b/examples/sycl/main/CMakeFiles/main.dir/link.txt
deleted file mode 100644
index 0127ad4f185dd..0000000000000
--- a/examples/sycl/main/CMakeFiles/main.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/main.dir/main.o -o main  -lcommon -lllama 
diff --git a/examples/sycl/main/CMakeFiles/main.dir/progress.make b/examples/sycl/main/CMakeFiles/main.dir/progress.make
deleted file mode 100644
index 30c309113cd04..0000000000000
--- a/examples/sycl/main/CMakeFiles/main.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 34
-CMAKE_PROGRESS_2 = 35
-
diff --git a/examples/sycl/main/CMakeFiles/progress.marks b/examples/sycl/main/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/main/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/main/Makefile b/examples/sycl/main/Makefile
deleted file mode 100644
index bcd6c6c9391dc..0000000000000
--- a/examples/sycl/main/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-main/CMakeFiles/main.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 main/CMakeFiles/main.dir/rule
-.PHONY : main/CMakeFiles/main.dir/rule
-
-# Convenience name for target.
-main: main/CMakeFiles/main.dir/rule
-.PHONY : main
-
-# fast build rule for target.
-main/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/build
-.PHONY : main/fast
-
-# target to build an object file
-main.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/main.o
-.PHONY : main.o
-
-# target to preprocess a source file
-main.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/main.i
-.PHONY : main.i
-
-# target to generate assembly for a file
-main.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f main/CMakeFiles/main.dir/build.make main/CMakeFiles/main.dir/main.s
-.PHONY : main.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... main"
-	@echo "... main.o"
-	@echo "... main.i"
-	@echo "... main.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/main/cmake_install.cmake b/examples/sycl/main/cmake_install.cmake
deleted file mode 100644
index dcec227e3f9d0..0000000000000
--- a/examples/sycl/main/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/main
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/main/main")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/main")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/parallel/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/parallel/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/parallel/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/DependInfo.cmake b/examples/sycl/parallel/CMakeFiles/parallel.dir/DependInfo.cmake
deleted file mode 100644
index 6e1d429f7cd8f..0000000000000
--- a/examples/sycl/parallel/CMakeFiles/parallel.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/parallel/parallel.cpp" "parallel/CMakeFiles/parallel.dir/parallel.o" "gcc" "parallel/CMakeFiles/parallel.dir/parallel.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/build.make b/examples/sycl/parallel/CMakeFiles/parallel.dir/build.make
deleted file mode 100644
index c982dff8926cc..0000000000000
--- a/examples/sycl/parallel/CMakeFiles/parallel.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include parallel/CMakeFiles/parallel.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include parallel/CMakeFiles/parallel.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include parallel/CMakeFiles/parallel.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include parallel/CMakeFiles/parallel.dir/flags.make
-
-parallel/CMakeFiles/parallel.dir/parallel.o: parallel/CMakeFiles/parallel.dir/flags.make
-parallel/CMakeFiles/parallel.dir/parallel.o: ../parallel/parallel.cpp
-parallel/CMakeFiles/parallel.dir/parallel.o: parallel/CMakeFiles/parallel.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object parallel/CMakeFiles/parallel.dir/parallel.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT parallel/CMakeFiles/parallel.dir/parallel.o -MF CMakeFiles/parallel.dir/parallel.o.d -o CMakeFiles/parallel.dir/parallel.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/parallel/parallel.cpp
-
-parallel/CMakeFiles/parallel.dir/parallel.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/parallel.dir/parallel.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/parallel/parallel.cpp > CMakeFiles/parallel.dir/parallel.i
-
-parallel/CMakeFiles/parallel.dir/parallel.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/parallel.dir/parallel.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/parallel/parallel.cpp -o CMakeFiles/parallel.dir/parallel.s
-
-# Object files for target parallel
-parallel_OBJECTS = \
-"CMakeFiles/parallel.dir/parallel.o"
-
-# External object files for target parallel
-parallel_EXTERNAL_OBJECTS =
-
-parallel/parallel: parallel/CMakeFiles/parallel.dir/parallel.o
-parallel/parallel: parallel/CMakeFiles/parallel.dir/build.make
-parallel/parallel: parallel/CMakeFiles/parallel.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable parallel"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/parallel.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-parallel/CMakeFiles/parallel.dir/build: parallel/parallel
-.PHONY : parallel/CMakeFiles/parallel.dir/build
-
-parallel/CMakeFiles/parallel.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel && $(CMAKE_COMMAND) -P CMakeFiles/parallel.dir/cmake_clean.cmake
-.PHONY : parallel/CMakeFiles/parallel.dir/clean
-
-parallel/CMakeFiles/parallel.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/parallel /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/CMakeFiles/parallel.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : parallel/CMakeFiles/parallel.dir/depend
-
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/cmake_clean.cmake b/examples/sycl/parallel/CMakeFiles/parallel.dir/cmake_clean.cmake
deleted file mode 100644
index 88fdf52287490..0000000000000
--- a/examples/sycl/parallel/CMakeFiles/parallel.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/parallel.dir/parallel.o"
-  "CMakeFiles/parallel.dir/parallel.o.d"
-  "parallel"
-  "parallel.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/parallel.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.make b/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.make
deleted file mode 100644
index 4adb5e4069cbb..0000000000000
--- a/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for parallel.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.ts b/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.ts
deleted file mode 100644
index 4fc0c95335bab..0000000000000
--- a/examples/sycl/parallel/CMakeFiles/parallel.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for parallel.
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/depend.make b/examples/sycl/parallel/CMakeFiles/parallel.dir/depend.make
deleted file mode 100644
index bb47426afbbe8..0000000000000
--- a/examples/sycl/parallel/CMakeFiles/parallel.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for parallel.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/flags.make b/examples/sycl/parallel/CMakeFiles/parallel.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/parallel/CMakeFiles/parallel.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/link.txt b/examples/sycl/parallel/CMakeFiles/parallel.dir/link.txt
deleted file mode 100644
index d242b572462cd..0000000000000
--- a/examples/sycl/parallel/CMakeFiles/parallel.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/parallel.dir/parallel.o -o parallel  -lcommon -lllama 
diff --git a/examples/sycl/parallel/CMakeFiles/parallel.dir/progress.make b/examples/sycl/parallel/CMakeFiles/parallel.dir/progress.make
deleted file mode 100644
index 2178e35f72e84..0000000000000
--- a/examples/sycl/parallel/CMakeFiles/parallel.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 36
-CMAKE_PROGRESS_2 = 37
-
diff --git a/examples/sycl/parallel/CMakeFiles/progress.marks b/examples/sycl/parallel/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/parallel/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/parallel/Makefile b/examples/sycl/parallel/Makefile
deleted file mode 100644
index 06e1de1287b9b..0000000000000
--- a/examples/sycl/parallel/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-parallel/CMakeFiles/parallel.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 parallel/CMakeFiles/parallel.dir/rule
-.PHONY : parallel/CMakeFiles/parallel.dir/rule
-
-# Convenience name for target.
-parallel: parallel/CMakeFiles/parallel.dir/rule
-.PHONY : parallel
-
-# fast build rule for target.
-parallel/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/build
-.PHONY : parallel/fast
-
-# target to build an object file
-parallel.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/parallel.o
-.PHONY : parallel.o
-
-# target to preprocess a source file
-parallel.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/parallel.i
-.PHONY : parallel.i
-
-# target to generate assembly for a file
-parallel.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f parallel/CMakeFiles/parallel.dir/build.make parallel/CMakeFiles/parallel.dir/parallel.s
-.PHONY : parallel.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... parallel"
-	@echo "... parallel.o"
-	@echo "... parallel.i"
-	@echo "... parallel.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/parallel/cmake_install.cmake b/examples/sycl/parallel/cmake_install.cmake
deleted file mode 100644
index 495382e76897a..0000000000000
--- a/examples/sycl/parallel/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/parallel
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/parallel/parallel")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/parallel")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/perplexity/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/perplexity/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/perplexity/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake
deleted file mode 100644
index 66b8b5f1ba73e..0000000000000
--- a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/perplexity/perplexity.cpp" "perplexity/CMakeFiles/perplexity.dir/perplexity.o" "gcc" "perplexity/CMakeFiles/perplexity.dir/perplexity.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/build.make b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/build.make
deleted file mode 100644
index 85b10e61c975f..0000000000000
--- a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include perplexity/CMakeFiles/perplexity.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include perplexity/CMakeFiles/perplexity.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include perplexity/CMakeFiles/perplexity.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include perplexity/CMakeFiles/perplexity.dir/flags.make
-
-perplexity/CMakeFiles/perplexity.dir/perplexity.o: perplexity/CMakeFiles/perplexity.dir/flags.make
-perplexity/CMakeFiles/perplexity.dir/perplexity.o: ../perplexity/perplexity.cpp
-perplexity/CMakeFiles/perplexity.dir/perplexity.o: perplexity/CMakeFiles/perplexity.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object perplexity/CMakeFiles/perplexity.dir/perplexity.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT perplexity/CMakeFiles/perplexity.dir/perplexity.o -MF CMakeFiles/perplexity.dir/perplexity.o.d -o CMakeFiles/perplexity.dir/perplexity.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/perplexity/perplexity.cpp
-
-perplexity/CMakeFiles/perplexity.dir/perplexity.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/perplexity.dir/perplexity.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/perplexity/perplexity.cpp > CMakeFiles/perplexity.dir/perplexity.i
-
-perplexity/CMakeFiles/perplexity.dir/perplexity.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/perplexity.dir/perplexity.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/perplexity/perplexity.cpp -o CMakeFiles/perplexity.dir/perplexity.s
-
-# Object files for target perplexity
-perplexity_OBJECTS = \
-"CMakeFiles/perplexity.dir/perplexity.o"
-
-# External object files for target perplexity
-perplexity_EXTERNAL_OBJECTS =
-
-perplexity/perplexity: perplexity/CMakeFiles/perplexity.dir/perplexity.o
-perplexity/perplexity: perplexity/CMakeFiles/perplexity.dir/build.make
-perplexity/perplexity: perplexity/CMakeFiles/perplexity.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable perplexity"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/perplexity.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-perplexity/CMakeFiles/perplexity.dir/build: perplexity/perplexity
-.PHONY : perplexity/CMakeFiles/perplexity.dir/build
-
-perplexity/CMakeFiles/perplexity.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity && $(CMAKE_COMMAND) -P CMakeFiles/perplexity.dir/cmake_clean.cmake
-.PHONY : perplexity/CMakeFiles/perplexity.dir/clean
-
-perplexity/CMakeFiles/perplexity.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/perplexity /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/CMakeFiles/perplexity.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : perplexity/CMakeFiles/perplexity.dir/depend
-
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/cmake_clean.cmake b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/cmake_clean.cmake
deleted file mode 100644
index b4c2d0a0102e5..0000000000000
--- a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/perplexity.dir/perplexity.o"
-  "CMakeFiles/perplexity.dir/perplexity.o.d"
-  "perplexity"
-  "perplexity.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/perplexity.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.make b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.make
deleted file mode 100644
index 5bde1b10339f0..0000000000000
--- a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for perplexity.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.ts b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.ts
deleted file mode 100644
index 5fe121b3b05c1..0000000000000
--- a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for perplexity.
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/depend.make b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/depend.make
deleted file mode 100644
index 405206272bc89..0000000000000
--- a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for perplexity.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/flags.make b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/link.txt b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/link.txt
deleted file mode 100644
index c08ea7dca9943..0000000000000
--- a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/perplexity.dir/perplexity.o -o perplexity  -lcommon -lllama 
diff --git a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/progress.make b/examples/sycl/perplexity/CMakeFiles/perplexity.dir/progress.make
deleted file mode 100644
index ad2d357457759..0000000000000
--- a/examples/sycl/perplexity/CMakeFiles/perplexity.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 38
-CMAKE_PROGRESS_2 = 39
-
diff --git a/examples/sycl/perplexity/CMakeFiles/progress.marks b/examples/sycl/perplexity/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/perplexity/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/perplexity/Makefile b/examples/sycl/perplexity/Makefile
deleted file mode 100644
index 0eefcb75e12a6..0000000000000
--- a/examples/sycl/perplexity/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-perplexity/CMakeFiles/perplexity.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 perplexity/CMakeFiles/perplexity.dir/rule
-.PHONY : perplexity/CMakeFiles/perplexity.dir/rule
-
-# Convenience name for target.
-perplexity: perplexity/CMakeFiles/perplexity.dir/rule
-.PHONY : perplexity
-
-# fast build rule for target.
-perplexity/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/build
-.PHONY : perplexity/fast
-
-# target to build an object file
-perplexity.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/perplexity.o
-.PHONY : perplexity.o
-
-# target to preprocess a source file
-perplexity.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/perplexity.i
-.PHONY : perplexity.i
-
-# target to generate assembly for a file
-perplexity.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f perplexity/CMakeFiles/perplexity.dir/build.make perplexity/CMakeFiles/perplexity.dir/perplexity.s
-.PHONY : perplexity.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... perplexity"
-	@echo "... perplexity.o"
-	@echo "... perplexity.i"
-	@echo "... perplexity.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/perplexity/cmake_install.cmake b/examples/sycl/perplexity/cmake_install.cmake
deleted file mode 100644
index 7ab46c66fc764..0000000000000
--- a/examples/sycl/perplexity/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/perplexity
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/perplexity/perplexity")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/perplexity")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/quantize-stats/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/quantize-stats/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/quantize-stats/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/quantize-stats/CMakeFiles/progress.marks b/examples/sycl/quantize-stats/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/quantize-stats/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake
deleted file mode 100644
index 2adf5eb27255b..0000000000000
--- a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats/quantize-stats.cpp" "quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o" "gcc" "quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/build.make b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/build.make
deleted file mode 100644
index b0f7750b26259..0000000000000
--- a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include quantize-stats/CMakeFiles/quantize-stats.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include quantize-stats/CMakeFiles/quantize-stats.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include quantize-stats/CMakeFiles/quantize-stats.dir/flags.make
-
-quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o: quantize-stats/CMakeFiles/quantize-stats.dir/flags.make
-quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o: ../quantize-stats/quantize-stats.cpp
-quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o: quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o -MF CMakeFiles/quantize-stats.dir/quantize-stats.o.d -o CMakeFiles/quantize-stats.dir/quantize-stats.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats/quantize-stats.cpp
-
-quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/quantize-stats.dir/quantize-stats.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats/quantize-stats.cpp > CMakeFiles/quantize-stats.dir/quantize-stats.i
-
-quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/quantize-stats.dir/quantize-stats.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats/quantize-stats.cpp -o CMakeFiles/quantize-stats.dir/quantize-stats.s
-
-# Object files for target quantize-stats
-quantize__stats_OBJECTS = \
-"CMakeFiles/quantize-stats.dir/quantize-stats.o"
-
-# External object files for target quantize-stats
-quantize__stats_EXTERNAL_OBJECTS =
-
-quantize-stats/quantize-stats: quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o
-quantize-stats/quantize-stats: quantize-stats/CMakeFiles/quantize-stats.dir/build.make
-quantize-stats/quantize-stats: quantize-stats/CMakeFiles/quantize-stats.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable quantize-stats"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/quantize-stats.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-quantize-stats/CMakeFiles/quantize-stats.dir/build: quantize-stats/quantize-stats
-.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/build
-
-quantize-stats/CMakeFiles/quantize-stats.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats && $(CMAKE_COMMAND) -P CMakeFiles/quantize-stats.dir/cmake_clean.cmake
-.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/clean
-
-quantize-stats/CMakeFiles/quantize-stats.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/depend
-
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/cmake_clean.cmake b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/cmake_clean.cmake
deleted file mode 100644
index 3561c9f782621..0000000000000
--- a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/quantize-stats.dir/quantize-stats.o"
-  "CMakeFiles/quantize-stats.dir/quantize-stats.o.d"
-  "quantize-stats"
-  "quantize-stats.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/quantize-stats.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.make b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.make
deleted file mode 100644
index 9dc6febaed86d..0000000000000
--- a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for quantize-stats.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.ts b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.ts
deleted file mode 100644
index 651269eca6c8d..0000000000000
--- a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for quantize-stats.
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/depend.make b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/depend.make
deleted file mode 100644
index 107cbdfe92549..0000000000000
--- a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for quantize-stats.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/flags.make b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/flags.make
deleted file mode 100644
index 94ae3d85e6a12..0000000000000
--- a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples -I/home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats/../../common
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/link.txt b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/link.txt
deleted file mode 100644
index fc2253c853144..0000000000000
--- a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/quantize-stats.dir/quantize-stats.o -o quantize-stats  -lllama -lbuild_info 
diff --git a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/progress.make b/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/progress.make
deleted file mode 100644
index 0ae216536520f..0000000000000
--- a/examples/sycl/quantize-stats/CMakeFiles/quantize-stats.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 42
-CMAKE_PROGRESS_2 = 43
-
diff --git a/examples/sycl/quantize-stats/Makefile b/examples/sycl/quantize-stats/Makefile
deleted file mode 100644
index 96034d833233a..0000000000000
--- a/examples/sycl/quantize-stats/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-quantize-stats/CMakeFiles/quantize-stats.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize-stats/CMakeFiles/quantize-stats.dir/rule
-.PHONY : quantize-stats/CMakeFiles/quantize-stats.dir/rule
-
-# Convenience name for target.
-quantize-stats: quantize-stats/CMakeFiles/quantize-stats.dir/rule
-.PHONY : quantize-stats
-
-# fast build rule for target.
-quantize-stats/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/build
-.PHONY : quantize-stats/fast
-
-# target to build an object file
-quantize-stats.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.o
-.PHONY : quantize-stats.o
-
-# target to preprocess a source file
-quantize-stats.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.i
-.PHONY : quantize-stats.i
-
-# target to generate assembly for a file
-quantize-stats.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize-stats/CMakeFiles/quantize-stats.dir/build.make quantize-stats/CMakeFiles/quantize-stats.dir/quantize-stats.s
-.PHONY : quantize-stats.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... quantize-stats"
-	@echo "... quantize-stats.o"
-	@echo "... quantize-stats.i"
-	@echo "... quantize-stats.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/quantize-stats/cmake_install.cmake b/examples/sycl/quantize-stats/cmake_install.cmake
deleted file mode 100644
index 0be2f85c929db..0000000000000
--- a/examples/sycl/quantize-stats/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/quantize-stats
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize-stats/quantize-stats")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize-stats")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/quantize/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/quantize/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/quantize/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/quantize/CMakeFiles/progress.marks b/examples/sycl/quantize/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/quantize/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/DependInfo.cmake b/examples/sycl/quantize/CMakeFiles/quantize.dir/DependInfo.cmake
deleted file mode 100644
index f61118e2cf827..0000000000000
--- a/examples/sycl/quantize/CMakeFiles/quantize.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/quantize/quantize.cpp" "quantize/CMakeFiles/quantize.dir/quantize.o" "gcc" "quantize/CMakeFiles/quantize.dir/quantize.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/build.make b/examples/sycl/quantize/CMakeFiles/quantize.dir/build.make
deleted file mode 100644
index bdc0585c11831..0000000000000
--- a/examples/sycl/quantize/CMakeFiles/quantize.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include quantize/CMakeFiles/quantize.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include quantize/CMakeFiles/quantize.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include quantize/CMakeFiles/quantize.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include quantize/CMakeFiles/quantize.dir/flags.make
-
-quantize/CMakeFiles/quantize.dir/quantize.o: quantize/CMakeFiles/quantize.dir/flags.make
-quantize/CMakeFiles/quantize.dir/quantize.o: ../quantize/quantize.cpp
-quantize/CMakeFiles/quantize.dir/quantize.o: quantize/CMakeFiles/quantize.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object quantize/CMakeFiles/quantize.dir/quantize.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT quantize/CMakeFiles/quantize.dir/quantize.o -MF CMakeFiles/quantize.dir/quantize.o.d -o CMakeFiles/quantize.dir/quantize.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/quantize/quantize.cpp
-
-quantize/CMakeFiles/quantize.dir/quantize.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/quantize.dir/quantize.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/quantize/quantize.cpp > CMakeFiles/quantize.dir/quantize.i
-
-quantize/CMakeFiles/quantize.dir/quantize.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/quantize.dir/quantize.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/quantize/quantize.cpp -o CMakeFiles/quantize.dir/quantize.s
-
-# Object files for target quantize
-quantize_OBJECTS = \
-"CMakeFiles/quantize.dir/quantize.o"
-
-# External object files for target quantize
-quantize_EXTERNAL_OBJECTS =
-
-quantize/quantize: quantize/CMakeFiles/quantize.dir/quantize.o
-quantize/quantize: quantize/CMakeFiles/quantize.dir/build.make
-quantize/quantize: quantize/CMakeFiles/quantize.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable quantize"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/quantize.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-quantize/CMakeFiles/quantize.dir/build: quantize/quantize
-.PHONY : quantize/CMakeFiles/quantize.dir/build
-
-quantize/CMakeFiles/quantize.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize && $(CMAKE_COMMAND) -P CMakeFiles/quantize.dir/cmake_clean.cmake
-.PHONY : quantize/CMakeFiles/quantize.dir/clean
-
-quantize/CMakeFiles/quantize.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/quantize /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/CMakeFiles/quantize.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : quantize/CMakeFiles/quantize.dir/depend
-
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/cmake_clean.cmake b/examples/sycl/quantize/CMakeFiles/quantize.dir/cmake_clean.cmake
deleted file mode 100644
index c1f5c99ba49e6..0000000000000
--- a/examples/sycl/quantize/CMakeFiles/quantize.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/quantize.dir/quantize.o"
-  "CMakeFiles/quantize.dir/quantize.o.d"
-  "quantize"
-  "quantize.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/quantize.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.make b/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.make
deleted file mode 100644
index 7c3c9a8f56920..0000000000000
--- a/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for quantize.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.ts b/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.ts
deleted file mode 100644
index 47e3bfba7fda6..0000000000000
--- a/examples/sycl/quantize/CMakeFiles/quantize.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for quantize.
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/depend.make b/examples/sycl/quantize/CMakeFiles/quantize.dir/depend.make
deleted file mode 100644
index 7ce880b4e6ff6..0000000000000
--- a/examples/sycl/quantize/CMakeFiles/quantize.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for quantize.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/flags.make b/examples/sycl/quantize/CMakeFiles/quantize.dir/flags.make
deleted file mode 100644
index e2c15ffddb1bd..0000000000000
--- a/examples/sycl/quantize/CMakeFiles/quantize.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples -I/home/jianyuzh/ws/llama.cpp/develop/examples/quantize/../../common
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/link.txt b/examples/sycl/quantize/CMakeFiles/quantize.dir/link.txt
deleted file mode 100644
index bbe5e4877c944..0000000000000
--- a/examples/sycl/quantize/CMakeFiles/quantize.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/quantize.dir/quantize.o -o quantize  -lllama -lbuild_info 
diff --git a/examples/sycl/quantize/CMakeFiles/quantize.dir/progress.make b/examples/sycl/quantize/CMakeFiles/quantize.dir/progress.make
deleted file mode 100644
index eb247b0834970..0000000000000
--- a/examples/sycl/quantize/CMakeFiles/quantize.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 40
-CMAKE_PROGRESS_2 = 41
-
diff --git a/examples/sycl/quantize/Makefile b/examples/sycl/quantize/Makefile
deleted file mode 100644
index d2ba6116c01c4..0000000000000
--- a/examples/sycl/quantize/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-quantize/CMakeFiles/quantize.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 quantize/CMakeFiles/quantize.dir/rule
-.PHONY : quantize/CMakeFiles/quantize.dir/rule
-
-# Convenience name for target.
-quantize: quantize/CMakeFiles/quantize.dir/rule
-.PHONY : quantize
-
-# fast build rule for target.
-quantize/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/build
-.PHONY : quantize/fast
-
-# target to build an object file
-quantize.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/quantize.o
-.PHONY : quantize.o
-
-# target to preprocess a source file
-quantize.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/quantize.i
-.PHONY : quantize.i
-
-# target to generate assembly for a file
-quantize.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f quantize/CMakeFiles/quantize.dir/build.make quantize/CMakeFiles/quantize.dir/quantize.s
-.PHONY : quantize.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... quantize"
-	@echo "... quantize.o"
-	@echo "... quantize.i"
-	@echo "... quantize.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/quantize/cmake_install.cmake b/examples/sycl/quantize/cmake_install.cmake
deleted file mode 100644
index b809c4e4a77cf..0000000000000
--- a/examples/sycl/quantize/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/quantize
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/quantize/quantize")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/quantize")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/save-load-state/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/save-load-state/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/save-load-state/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/save-load-state/CMakeFiles/progress.marks b/examples/sycl/save-load-state/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/save-load-state/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake
deleted file mode 100644
index 9fc824e9a7976..0000000000000
--- a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state/save-load-state.cpp" "save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o" "gcc" "save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/build.make b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/build.make
deleted file mode 100644
index ac2b2a2609fee..0000000000000
--- a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include save-load-state/CMakeFiles/save-load-state.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include save-load-state/CMakeFiles/save-load-state.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include save-load-state/CMakeFiles/save-load-state.dir/flags.make
-
-save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o: save-load-state/CMakeFiles/save-load-state.dir/flags.make
-save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o: ../save-load-state/save-load-state.cpp
-save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o: save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o -MF CMakeFiles/save-load-state.dir/save-load-state.o.d -o CMakeFiles/save-load-state.dir/save-load-state.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state/save-load-state.cpp
-
-save-load-state/CMakeFiles/save-load-state.dir/save-load-state.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/save-load-state.dir/save-load-state.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state/save-load-state.cpp > CMakeFiles/save-load-state.dir/save-load-state.i
-
-save-load-state/CMakeFiles/save-load-state.dir/save-load-state.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/save-load-state.dir/save-load-state.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state/save-load-state.cpp -o CMakeFiles/save-load-state.dir/save-load-state.s
-
-# Object files for target save-load-state
-save__load__state_OBJECTS = \
-"CMakeFiles/save-load-state.dir/save-load-state.o"
-
-# External object files for target save-load-state
-save__load__state_EXTERNAL_OBJECTS =
-
-save-load-state/save-load-state: save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o
-save-load-state/save-load-state: save-load-state/CMakeFiles/save-load-state.dir/build.make
-save-load-state/save-load-state: save-load-state/CMakeFiles/save-load-state.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable save-load-state"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/save-load-state.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-save-load-state/CMakeFiles/save-load-state.dir/build: save-load-state/save-load-state
-.PHONY : save-load-state/CMakeFiles/save-load-state.dir/build
-
-save-load-state/CMakeFiles/save-load-state.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state && $(CMAKE_COMMAND) -P CMakeFiles/save-load-state.dir/cmake_clean.cmake
-.PHONY : save-load-state/CMakeFiles/save-load-state.dir/clean
-
-save-load-state/CMakeFiles/save-load-state.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : save-load-state/CMakeFiles/save-load-state.dir/depend
-
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/cmake_clean.cmake b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/cmake_clean.cmake
deleted file mode 100644
index af5b89fcb9eca..0000000000000
--- a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/save-load-state.dir/save-load-state.o"
-  "CMakeFiles/save-load-state.dir/save-load-state.o.d"
-  "save-load-state"
-  "save-load-state.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/save-load-state.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.make b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.make
deleted file mode 100644
index a4577943f7e2d..0000000000000
--- a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for save-load-state.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.ts b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.ts
deleted file mode 100644
index 64d7612a4a24d..0000000000000
--- a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for save-load-state.
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/depend.make b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/depend.make
deleted file mode 100644
index f99caaf712a32..0000000000000
--- a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for save-load-state.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/flags.make b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/link.txt b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/link.txt
deleted file mode 100644
index b1324b558d89e..0000000000000
--- a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/save-load-state.dir/save-load-state.o -o save-load-state  -lcommon -lllama 
diff --git a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/progress.make b/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/progress.make
deleted file mode 100644
index c51eef3db77b0..0000000000000
--- a/examples/sycl/save-load-state/CMakeFiles/save-load-state.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 44
-CMAKE_PROGRESS_2 = 45
-
diff --git a/examples/sycl/save-load-state/Makefile b/examples/sycl/save-load-state/Makefile
deleted file mode 100644
index c9509ab5c7c91..0000000000000
--- a/examples/sycl/save-load-state/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-save-load-state/CMakeFiles/save-load-state.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 save-load-state/CMakeFiles/save-load-state.dir/rule
-.PHONY : save-load-state/CMakeFiles/save-load-state.dir/rule
-
-# Convenience name for target.
-save-load-state: save-load-state/CMakeFiles/save-load-state.dir/rule
-.PHONY : save-load-state
-
-# fast build rule for target.
-save-load-state/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/build
-.PHONY : save-load-state/fast
-
-# target to build an object file
-save-load-state.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/save-load-state.o
-.PHONY : save-load-state.o
-
-# target to preprocess a source file
-save-load-state.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/save-load-state.i
-.PHONY : save-load-state.i
-
-# target to generate assembly for a file
-save-load-state.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f save-load-state/CMakeFiles/save-load-state.dir/build.make save-load-state/CMakeFiles/save-load-state.dir/save-load-state.s
-.PHONY : save-load-state.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... save-load-state"
-	@echo "... save-load-state.o"
-	@echo "... save-load-state.i"
-	@echo "... save-load-state.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/save-load-state/cmake_install.cmake b/examples/sycl/save-load-state/cmake_install.cmake
deleted file mode 100644
index 6fa691a41e16e..0000000000000
--- a/examples/sycl/save-load-state/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/save-load-state
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/save-load-state/save-load-state")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/save-load-state")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/simple/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/simple/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/simple/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/simple/CMakeFiles/progress.marks b/examples/sycl/simple/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/simple/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/DependInfo.cmake b/examples/sycl/simple/CMakeFiles/simple.dir/DependInfo.cmake
deleted file mode 100644
index 9d85da155f74d..0000000000000
--- a/examples/sycl/simple/CMakeFiles/simple.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/simple/simple.cpp" "simple/CMakeFiles/simple.dir/simple.o" "gcc" "simple/CMakeFiles/simple.dir/simple.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/build.make b/examples/sycl/simple/CMakeFiles/simple.dir/build.make
deleted file mode 100644
index a6aa4f6338188..0000000000000
--- a/examples/sycl/simple/CMakeFiles/simple.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include simple/CMakeFiles/simple.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include simple/CMakeFiles/simple.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include simple/CMakeFiles/simple.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include simple/CMakeFiles/simple.dir/flags.make
-
-simple/CMakeFiles/simple.dir/simple.o: simple/CMakeFiles/simple.dir/flags.make
-simple/CMakeFiles/simple.dir/simple.o: ../simple/simple.cpp
-simple/CMakeFiles/simple.dir/simple.o: simple/CMakeFiles/simple.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object simple/CMakeFiles/simple.dir/simple.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT simple/CMakeFiles/simple.dir/simple.o -MF CMakeFiles/simple.dir/simple.o.d -o CMakeFiles/simple.dir/simple.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/simple/simple.cpp
-
-simple/CMakeFiles/simple.dir/simple.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/simple.dir/simple.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/simple/simple.cpp > CMakeFiles/simple.dir/simple.i
-
-simple/CMakeFiles/simple.dir/simple.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/simple.dir/simple.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/simple/simple.cpp -o CMakeFiles/simple.dir/simple.s
-
-# Object files for target simple
-simple_OBJECTS = \
-"CMakeFiles/simple.dir/simple.o"
-
-# External object files for target simple
-simple_EXTERNAL_OBJECTS =
-
-simple/simple: simple/CMakeFiles/simple.dir/simple.o
-simple/simple: simple/CMakeFiles/simple.dir/build.make
-simple/simple: simple/CMakeFiles/simple.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable simple"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/simple.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-simple/CMakeFiles/simple.dir/build: simple/simple
-.PHONY : simple/CMakeFiles/simple.dir/build
-
-simple/CMakeFiles/simple.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple && $(CMAKE_COMMAND) -P CMakeFiles/simple.dir/cmake_clean.cmake
-.PHONY : simple/CMakeFiles/simple.dir/clean
-
-simple/CMakeFiles/simple.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/simple /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/CMakeFiles/simple.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : simple/CMakeFiles/simple.dir/depend
-
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/cmake_clean.cmake b/examples/sycl/simple/CMakeFiles/simple.dir/cmake_clean.cmake
deleted file mode 100644
index 554207f3bf60e..0000000000000
--- a/examples/sycl/simple/CMakeFiles/simple.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/simple.dir/simple.o"
-  "CMakeFiles/simple.dir/simple.o.d"
-  "simple"
-  "simple.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/simple.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.make b/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.make
deleted file mode 100644
index 5703239cd22ca..0000000000000
--- a/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for simple.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.ts b/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.ts
deleted file mode 100644
index 3712ddf6ecefc..0000000000000
--- a/examples/sycl/simple/CMakeFiles/simple.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for simple.
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/depend.make b/examples/sycl/simple/CMakeFiles/simple.dir/depend.make
deleted file mode 100644
index 1857af881f09a..0000000000000
--- a/examples/sycl/simple/CMakeFiles/simple.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for simple.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/flags.make b/examples/sycl/simple/CMakeFiles/simple.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/simple/CMakeFiles/simple.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/link.txt b/examples/sycl/simple/CMakeFiles/simple.dir/link.txt
deleted file mode 100644
index c5460183e6dca..0000000000000
--- a/examples/sycl/simple/CMakeFiles/simple.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/simple.dir/simple.o -o simple  -lcommon -lllama 
diff --git a/examples/sycl/simple/CMakeFiles/simple.dir/progress.make b/examples/sycl/simple/CMakeFiles/simple.dir/progress.make
deleted file mode 100644
index 80ac6c176b991..0000000000000
--- a/examples/sycl/simple/CMakeFiles/simple.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 46
-CMAKE_PROGRESS_2 = 47
-
diff --git a/examples/sycl/simple/Makefile b/examples/sycl/simple/Makefile
deleted file mode 100644
index d22ab5997f36e..0000000000000
--- a/examples/sycl/simple/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-simple/CMakeFiles/simple.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 simple/CMakeFiles/simple.dir/rule
-.PHONY : simple/CMakeFiles/simple.dir/rule
-
-# Convenience name for target.
-simple: simple/CMakeFiles/simple.dir/rule
-.PHONY : simple
-
-# fast build rule for target.
-simple/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/build
-.PHONY : simple/fast
-
-# target to build an object file
-simple.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/simple.o
-.PHONY : simple.o
-
-# target to preprocess a source file
-simple.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/simple.i
-.PHONY : simple.i
-
-# target to generate assembly for a file
-simple.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f simple/CMakeFiles/simple.dir/build.make simple/CMakeFiles/simple.dir/simple.s
-.PHONY : simple.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... simple"
-	@echo "... simple.o"
-	@echo "... simple.i"
-	@echo "... simple.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/simple/cmake_install.cmake b/examples/sycl/simple/cmake_install.cmake
deleted file mode 100644
index 22ab3eaa27cb4..0000000000000
--- a/examples/sycl/simple/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/simple
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/simple/simple")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/simple")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/speculative/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/speculative/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/speculative/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/speculative/CMakeFiles/progress.marks b/examples/sycl/speculative/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/speculative/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/DependInfo.cmake b/examples/sycl/speculative/CMakeFiles/speculative.dir/DependInfo.cmake
deleted file mode 100644
index 9e7f0cddf574b..0000000000000
--- a/examples/sycl/speculative/CMakeFiles/speculative.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/speculative/speculative.cpp" "speculative/CMakeFiles/speculative.dir/speculative.o" "gcc" "speculative/CMakeFiles/speculative.dir/speculative.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/build.make b/examples/sycl/speculative/CMakeFiles/speculative.dir/build.make
deleted file mode 100644
index 64c40f740346c..0000000000000
--- a/examples/sycl/speculative/CMakeFiles/speculative.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include speculative/CMakeFiles/speculative.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include speculative/CMakeFiles/speculative.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include speculative/CMakeFiles/speculative.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include speculative/CMakeFiles/speculative.dir/flags.make
-
-speculative/CMakeFiles/speculative.dir/speculative.o: speculative/CMakeFiles/speculative.dir/flags.make
-speculative/CMakeFiles/speculative.dir/speculative.o: ../speculative/speculative.cpp
-speculative/CMakeFiles/speculative.dir/speculative.o: speculative/CMakeFiles/speculative.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object speculative/CMakeFiles/speculative.dir/speculative.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT speculative/CMakeFiles/speculative.dir/speculative.o -MF CMakeFiles/speculative.dir/speculative.o.d -o CMakeFiles/speculative.dir/speculative.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/speculative/speculative.cpp
-
-speculative/CMakeFiles/speculative.dir/speculative.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/speculative.dir/speculative.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/speculative/speculative.cpp > CMakeFiles/speculative.dir/speculative.i
-
-speculative/CMakeFiles/speculative.dir/speculative.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/speculative.dir/speculative.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/speculative/speculative.cpp -o CMakeFiles/speculative.dir/speculative.s
-
-# Object files for target speculative
-speculative_OBJECTS = \
-"CMakeFiles/speculative.dir/speculative.o"
-
-# External object files for target speculative
-speculative_EXTERNAL_OBJECTS =
-
-speculative/speculative: speculative/CMakeFiles/speculative.dir/speculative.o
-speculative/speculative: speculative/CMakeFiles/speculative.dir/build.make
-speculative/speculative: speculative/CMakeFiles/speculative.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable speculative"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/speculative.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-speculative/CMakeFiles/speculative.dir/build: speculative/speculative
-.PHONY : speculative/CMakeFiles/speculative.dir/build
-
-speculative/CMakeFiles/speculative.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative && $(CMAKE_COMMAND) -P CMakeFiles/speculative.dir/cmake_clean.cmake
-.PHONY : speculative/CMakeFiles/speculative.dir/clean
-
-speculative/CMakeFiles/speculative.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/speculative /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/CMakeFiles/speculative.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : speculative/CMakeFiles/speculative.dir/depend
-
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/cmake_clean.cmake b/examples/sycl/speculative/CMakeFiles/speculative.dir/cmake_clean.cmake
deleted file mode 100644
index 68845ff09127a..0000000000000
--- a/examples/sycl/speculative/CMakeFiles/speculative.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/speculative.dir/speculative.o"
-  "CMakeFiles/speculative.dir/speculative.o.d"
-  "speculative"
-  "speculative.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/speculative.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.make b/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.make
deleted file mode 100644
index b7621539416da..0000000000000
--- a/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for speculative.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.ts b/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.ts
deleted file mode 100644
index 2afc705eca269..0000000000000
--- a/examples/sycl/speculative/CMakeFiles/speculative.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for speculative.
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/depend.make b/examples/sycl/speculative/CMakeFiles/speculative.dir/depend.make
deleted file mode 100644
index 6da57c2baa79f..0000000000000
--- a/examples/sycl/speculative/CMakeFiles/speculative.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for speculative.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/flags.make b/examples/sycl/speculative/CMakeFiles/speculative.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/speculative/CMakeFiles/speculative.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/link.txt b/examples/sycl/speculative/CMakeFiles/speculative.dir/link.txt
deleted file mode 100644
index 9cfb674c94f2a..0000000000000
--- a/examples/sycl/speculative/CMakeFiles/speculative.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/speculative.dir/speculative.o -o speculative  -lcommon -lllama 
diff --git a/examples/sycl/speculative/CMakeFiles/speculative.dir/progress.make b/examples/sycl/speculative/CMakeFiles/speculative.dir/progress.make
deleted file mode 100644
index 961e1cad8a491..0000000000000
--- a/examples/sycl/speculative/CMakeFiles/speculative.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 48
-CMAKE_PROGRESS_2 = 49
-
diff --git a/examples/sycl/speculative/Makefile b/examples/sycl/speculative/Makefile
deleted file mode 100644
index 713de48f104ff..0000000000000
--- a/examples/sycl/speculative/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-speculative/CMakeFiles/speculative.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 speculative/CMakeFiles/speculative.dir/rule
-.PHONY : speculative/CMakeFiles/speculative.dir/rule
-
-# Convenience name for target.
-speculative: speculative/CMakeFiles/speculative.dir/rule
-.PHONY : speculative
-
-# fast build rule for target.
-speculative/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/build
-.PHONY : speculative/fast
-
-# target to build an object file
-speculative.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/speculative.o
-.PHONY : speculative.o
-
-# target to preprocess a source file
-speculative.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/speculative.i
-.PHONY : speculative.i
-
-# target to generate assembly for a file
-speculative.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f speculative/CMakeFiles/speculative.dir/build.make speculative/CMakeFiles/speculative.dir/speculative.s
-.PHONY : speculative.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... speculative"
-	@echo "... speculative.o"
-	@echo "... speculative.i"
-	@echo "... speculative.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/speculative/cmake_install.cmake b/examples/sycl/speculative/cmake_install.cmake
deleted file mode 100644
index 7c9f489081d36..0000000000000
--- a/examples/sycl/speculative/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/speculative
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/speculative/speculative")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/speculative")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/sycl/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/sycl/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake
deleted file mode 100644
index f86d13476a2fa..0000000000000
--- a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/ls-sycl-device.cpp" "sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o" "gcc" "sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/build.make b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/build.make
deleted file mode 100644
index 9a8988ea8affd..0000000000000
--- a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include sycl/CMakeFiles/ls-sycl-device.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include sycl/CMakeFiles/ls-sycl-device.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include sycl/CMakeFiles/ls-sycl-device.dir/flags.make
-
-sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o: sycl/CMakeFiles/ls-sycl-device.dir/flags.make
-sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o: ls-sycl-device.cpp
-sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o: sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o -MF CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o.d -o CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/ls-sycl-device.cpp
-
-sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/ls-sycl-device.dir/ls-sycl-device.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/ls-sycl-device.cpp > CMakeFiles/ls-sycl-device.dir/ls-sycl-device.i
-
-sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/ls-sycl-device.dir/ls-sycl-device.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/ls-sycl-device.cpp -o CMakeFiles/ls-sycl-device.dir/ls-sycl-device.s
-
-# Object files for target ls-sycl-device
-ls__sycl__device_OBJECTS = \
-"CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o"
-
-# External object files for target ls-sycl-device
-ls__sycl__device_EXTERNAL_OBJECTS =
-
-sycl/ls-sycl-device: sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o
-sycl/ls-sycl-device: sycl/CMakeFiles/ls-sycl-device.dir/build.make
-sycl/ls-sycl-device: sycl/CMakeFiles/ls-sycl-device.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable ls-sycl-device"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/ls-sycl-device.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-sycl/CMakeFiles/ls-sycl-device.dir/build: sycl/ls-sycl-device
-.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/build
-
-sycl/CMakeFiles/ls-sycl-device.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl && $(CMAKE_COMMAND) -P CMakeFiles/ls-sycl-device.dir/cmake_clean.cmake
-.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/clean
-
-sycl/CMakeFiles/ls-sycl-device.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/depend
-
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/cmake_clean.cmake b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/cmake_clean.cmake
deleted file mode 100644
index f7bcb5646f683..0000000000000
--- a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o"
-  "CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o.d"
-  "ls-sycl-device"
-  "ls-sycl-device.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/ls-sycl-device.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.make b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.make
deleted file mode 100644
index eda69ffb9384d..0000000000000
--- a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for ls-sycl-device.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.ts b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.ts
deleted file mode 100644
index c5f1ce2e44e53..0000000000000
--- a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for ls-sycl-device.
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/depend.make b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/depend.make
deleted file mode 100644
index 911368e45b466..0000000000000
--- a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for ls-sycl-device.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/flags.make b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/flags.make
deleted file mode 100644
index 931e8c441e8bd..0000000000000
--- a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS =  -std=c++17 -Wno-narrowing -O3 -fsycl -I./ -I//2024.0/include
-
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/link.txt b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/link.txt
deleted file mode 100644
index 44973a40da0f0..0000000000000
--- a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx  -std=c++17 -Wno-narrowing -O3 -fsycl -rdynamic CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o -o ls-sycl-device  -lcommon -lllama 
diff --git a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/progress.make b/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/progress.make
deleted file mode 100644
index e1615c182fcdc..0000000000000
--- a/examples/sycl/sycl/CMakeFiles/ls-sycl-device.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 32
-CMAKE_PROGRESS_2 = 33
-
diff --git a/examples/sycl/sycl/CMakeFiles/progress.marks b/examples/sycl/sycl/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/sycl/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/sycl/Makefile b/examples/sycl/sycl/Makefile
deleted file mode 100644
index b6a09fb7f350b..0000000000000
--- a/examples/sycl/sycl/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-sycl/CMakeFiles/ls-sycl-device.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 sycl/CMakeFiles/ls-sycl-device.dir/rule
-.PHONY : sycl/CMakeFiles/ls-sycl-device.dir/rule
-
-# Convenience name for target.
-ls-sycl-device: sycl/CMakeFiles/ls-sycl-device.dir/rule
-.PHONY : ls-sycl-device
-
-# fast build rule for target.
-ls-sycl-device/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/build
-.PHONY : ls-sycl-device/fast
-
-# target to build an object file
-ls-sycl-device.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.o
-.PHONY : ls-sycl-device.o
-
-# target to preprocess a source file
-ls-sycl-device.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.i
-.PHONY : ls-sycl-device.i
-
-# target to generate assembly for a file
-ls-sycl-device.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f sycl/CMakeFiles/ls-sycl-device.dir/build.make sycl/CMakeFiles/ls-sycl-device.dir/ls-sycl-device.s
-.PHONY : ls-sycl-device.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... ls-sycl-device"
-	@echo "... ls-sycl-device.o"
-	@echo "... ls-sycl-device.i"
-	@echo "... ls-sycl-device.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/sycl/cmake_install.cmake b/examples/sycl/sycl/cmake_install.cmake
deleted file mode 100644
index 17c549fd9d29c..0000000000000
--- a/examples/sycl/sycl/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/sycl/ls-sycl-device")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/ls-sycl-device")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/tokenize/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/tokenize/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/tokenize/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/tokenize/CMakeFiles/progress.marks b/examples/sycl/tokenize/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/tokenize/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake
deleted file mode 100644
index f887be24f00a7..0000000000000
--- a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/tokenize/tokenize.cpp" "tokenize/CMakeFiles/tokenize.dir/tokenize.o" "gcc" "tokenize/CMakeFiles/tokenize.dir/tokenize.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/build.make b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/build.make
deleted file mode 100644
index 1164bb382d6e2..0000000000000
--- a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include tokenize/CMakeFiles/tokenize.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include tokenize/CMakeFiles/tokenize.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include tokenize/CMakeFiles/tokenize.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include tokenize/CMakeFiles/tokenize.dir/flags.make
-
-tokenize/CMakeFiles/tokenize.dir/tokenize.o: tokenize/CMakeFiles/tokenize.dir/flags.make
-tokenize/CMakeFiles/tokenize.dir/tokenize.o: ../tokenize/tokenize.cpp
-tokenize/CMakeFiles/tokenize.dir/tokenize.o: tokenize/CMakeFiles/tokenize.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object tokenize/CMakeFiles/tokenize.dir/tokenize.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT tokenize/CMakeFiles/tokenize.dir/tokenize.o -MF CMakeFiles/tokenize.dir/tokenize.o.d -o CMakeFiles/tokenize.dir/tokenize.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/tokenize/tokenize.cpp
-
-tokenize/CMakeFiles/tokenize.dir/tokenize.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/tokenize.dir/tokenize.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/tokenize/tokenize.cpp > CMakeFiles/tokenize.dir/tokenize.i
-
-tokenize/CMakeFiles/tokenize.dir/tokenize.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/tokenize.dir/tokenize.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/tokenize/tokenize.cpp -o CMakeFiles/tokenize.dir/tokenize.s
-
-# Object files for target tokenize
-tokenize_OBJECTS = \
-"CMakeFiles/tokenize.dir/tokenize.o"
-
-# External object files for target tokenize
-tokenize_EXTERNAL_OBJECTS =
-
-tokenize/tokenize: tokenize/CMakeFiles/tokenize.dir/tokenize.o
-tokenize/tokenize: tokenize/CMakeFiles/tokenize.dir/build.make
-tokenize/tokenize: tokenize/CMakeFiles/tokenize.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable tokenize"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/tokenize.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-tokenize/CMakeFiles/tokenize.dir/build: tokenize/tokenize
-.PHONY : tokenize/CMakeFiles/tokenize.dir/build
-
-tokenize/CMakeFiles/tokenize.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize && $(CMAKE_COMMAND) -P CMakeFiles/tokenize.dir/cmake_clean.cmake
-.PHONY : tokenize/CMakeFiles/tokenize.dir/clean
-
-tokenize/CMakeFiles/tokenize.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/tokenize /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/CMakeFiles/tokenize.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : tokenize/CMakeFiles/tokenize.dir/depend
-
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/cmake_clean.cmake b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/cmake_clean.cmake
deleted file mode 100644
index 1f3463088133d..0000000000000
--- a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/tokenize.dir/tokenize.o"
-  "CMakeFiles/tokenize.dir/tokenize.o.d"
-  "tokenize"
-  "tokenize.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/tokenize.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.make b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.make
deleted file mode 100644
index 13f791178b843..0000000000000
--- a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for tokenize.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.ts b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.ts
deleted file mode 100644
index 0900c89c764cc..0000000000000
--- a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for tokenize.
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/depend.make b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/depend.make
deleted file mode 100644
index e1f613484cfb7..0000000000000
--- a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for tokenize.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/flags.make b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/link.txt b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/link.txt
deleted file mode 100644
index a0037536c35a1..0000000000000
--- a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/tokenize.dir/tokenize.o -o tokenize  -lcommon -lllama 
diff --git a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/progress.make b/examples/sycl/tokenize/CMakeFiles/tokenize.dir/progress.make
deleted file mode 100644
index dbabe1f503005..0000000000000
--- a/examples/sycl/tokenize/CMakeFiles/tokenize.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 50
-CMAKE_PROGRESS_2 = 51
-
diff --git a/examples/sycl/tokenize/Makefile b/examples/sycl/tokenize/Makefile
deleted file mode 100644
index 19d5b91c6bd62..0000000000000
--- a/examples/sycl/tokenize/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-tokenize/CMakeFiles/tokenize.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 tokenize/CMakeFiles/tokenize.dir/rule
-.PHONY : tokenize/CMakeFiles/tokenize.dir/rule
-
-# Convenience name for target.
-tokenize: tokenize/CMakeFiles/tokenize.dir/rule
-.PHONY : tokenize
-
-# fast build rule for target.
-tokenize/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/build
-.PHONY : tokenize/fast
-
-# target to build an object file
-tokenize.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/tokenize.o
-.PHONY : tokenize.o
-
-# target to preprocess a source file
-tokenize.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/tokenize.i
-.PHONY : tokenize.i
-
-# target to generate assembly for a file
-tokenize.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f tokenize/CMakeFiles/tokenize.dir/build.make tokenize/CMakeFiles/tokenize.dir/tokenize.s
-.PHONY : tokenize.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... tokenize"
-	@echo "... tokenize.o"
-	@echo "... tokenize.i"
-	@echo "... tokenize.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/tokenize/cmake_install.cmake b/examples/sycl/tokenize/cmake_install.cmake
deleted file mode 100644
index a3c2dfb59a705..0000000000000
--- a/examples/sycl/tokenize/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/tokenize
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/tokenize/tokenize")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/tokenize")
-    endif()
-  endif()
-endif()
-
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/CMakeDirectoryInformation.cmake b/examples/sycl/train-text-from-scratch/CMakeFiles/CMakeDirectoryInformation.cmake
deleted file mode 100644
index 5c593a47325e1..0000000000000
--- a/examples/sycl/train-text-from-scratch/CMakeFiles/CMakeDirectoryInformation.cmake
+++ /dev/null
@@ -1,16 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Relative path conversion top directories.
-set(CMAKE_RELATIVE_PATH_TOP_SOURCE "/home/jianyuzh/ws/llama.cpp/develop/examples")
-set(CMAKE_RELATIVE_PATH_TOP_BINARY "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl")
-
-# Force unix paths in dependencies.
-set(CMAKE_FORCE_UNIX_PATHS 1)
-
-
-# The C and CXX include file regular expressions for this directory.
-set(CMAKE_C_INCLUDE_REGEX_SCAN "^.*$")
-set(CMAKE_C_INCLUDE_REGEX_COMPLAIN "^$")
-set(CMAKE_CXX_INCLUDE_REGEX_SCAN ${CMAKE_C_INCLUDE_REGEX_SCAN})
-set(CMAKE_CXX_INCLUDE_REGEX_COMPLAIN ${CMAKE_C_INCLUDE_REGEX_COMPLAIN})
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/progress.marks b/examples/sycl/train-text-from-scratch/CMakeFiles/progress.marks
deleted file mode 100644
index 0cfbf08886fca..0000000000000
--- a/examples/sycl/train-text-from-scratch/CMakeFiles/progress.marks
+++ /dev/null
@@ -1 +0,0 @@
-2
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake
deleted file mode 100644
index b6611153346d0..0000000000000
--- a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake
+++ /dev/null
@@ -1,19 +0,0 @@
-
-# Consider dependencies only in project.
-set(CMAKE_DEPENDS_IN_PROJECT_ONLY OFF)
-
-# The set of languages for which implicit dependencies are needed:
-set(CMAKE_DEPENDS_LANGUAGES
-  )
-
-# The set of dependency files which are needed:
-set(CMAKE_DEPENDS_DEPENDENCY_FILES
-  "/home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch/train-text-from-scratch.cpp" "train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o" "gcc" "train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o.d"
-  )
-
-# Targets to which this target links.
-set(CMAKE_TARGET_LINKED_INFO_FILES
-  )
-
-# Fortran module output directory.
-set(CMAKE_Fortran_TARGET_MODULE_DIR "")
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make
deleted file mode 100644
index 5c57f36d5bf74..0000000000000
--- a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make
+++ /dev/null
@@ -1,110 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Delete rule output on recipe failure.
-.DELETE_ON_ERROR:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-# Include any dependencies generated for this target.
-include train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend.make
-# Include any dependencies generated by the compiler for this target.
-include train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.make
-
-# Include the progress variables for this target.
-include train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/progress.make
-
-# Include the compile flags for this target's objects.
-include train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make
-
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o: ../train-text-from-scratch/train-text-from-scratch.cpp
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.ts
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_1) "Building CXX object train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -MD -MT train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o -MF CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o.d -o CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o -c /home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch/train-text-from-scratch.cpp
-
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.i: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Preprocessing CXX source to CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.i"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -E /home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch/train-text-from-scratch.cpp > CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.i
-
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.s: cmake_force
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green "Compiling CXX source to assembly CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.s"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch && /opt/intel/oneapi/compiler/2024.0/bin/icpx $(CXX_DEFINES) $(CXX_INCLUDES) $(CXX_FLAGS) -S /home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch/train-text-from-scratch.cpp -o CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.s
-
-# Object files for target train-text-from-scratch
-train__text__from__scratch_OBJECTS = \
-"CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o"
-
-# External object files for target train-text-from-scratch
-train__text__from__scratch_EXTERNAL_OBJECTS =
-
-train-text-from-scratch/train-text-from-scratch: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o
-train-text-from-scratch/train-text-from-scratch: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make
-train-text-from-scratch/train-text-from-scratch: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/link.txt
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --green --bold --progress-dir=/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles --progress-num=$(CMAKE_PROGRESS_2) "Linking CXX executable train-text-from-scratch"
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch && $(CMAKE_COMMAND) -E cmake_link_script CMakeFiles/train-text-from-scratch.dir/link.txt --verbose=$(VERBOSE)
-
-# Rule to build all files generated by this target.
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build: train-text-from-scratch/train-text-from-scratch
-.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build
-
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch && $(CMAKE_COMMAND) -P CMakeFiles/train-text-from-scratch.dir/cmake_clean.cmake
-.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/clean
-
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_depends "Unix Makefiles" /home/jianyuzh/ws/llama.cpp/develop/examples /home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch /home/jianyuzh/ws/llama.cpp/develop/examples/sycl /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/DependInfo.cmake --color=$(COLOR)
-.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend
-
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/cmake_clean.cmake b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/cmake_clean.cmake
deleted file mode 100644
index 05bfb17e3ae10..0000000000000
--- a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/cmake_clean.cmake
+++ /dev/null
@@ -1,11 +0,0 @@
-file(REMOVE_RECURSE
-  "CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o"
-  "CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o.d"
-  "train-text-from-scratch"
-  "train-text-from-scratch.pdb"
-)
-
-# Per-language clean rules from dependency scanning.
-foreach(lang CXX)
-  include(CMakeFiles/train-text-from-scratch.dir/cmake_clean_${lang}.cmake OPTIONAL)
-endforeach()
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.make b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.make
deleted file mode 100644
index e9531edfa6d92..0000000000000
--- a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty compiler generated dependencies file for train-text-from-scratch.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.ts b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.ts
deleted file mode 100644
index 27df3e1de02f5..0000000000000
--- a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/compiler_depend.ts
+++ /dev/null
@@ -1,2 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Timestamp file for compiler generated dependencies management for train-text-from-scratch.
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend.make b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend.make
deleted file mode 100644
index 4aa3cbd93ed4d..0000000000000
--- a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/depend.make
+++ /dev/null
@@ -1,2 +0,0 @@
-# Empty dependencies file for train-text-from-scratch.
-# This may be replaced when dependencies are built.
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make
deleted file mode 100644
index 6bddd3d69b220..0000000000000
--- a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/flags.make
+++ /dev/null
@@ -1,10 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# compile CXX with /opt/intel/oneapi/compiler/2024.0/bin/icpx
-CXX_DEFINES = 
-
-CXX_INCLUDES = -I/home/jianyuzh/ws/llama.cpp/develop/examples
-
-CXX_FLAGS = 
-
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/link.txt b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/link.txt
deleted file mode 100644
index e6e1f8bf105b5..0000000000000
--- a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/link.txt
+++ /dev/null
@@ -1 +0,0 @@
-/opt/intel/oneapi/compiler/2024.0/bin/icpx -rdynamic CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o -o train-text-from-scratch  -lcommon -lllama 
diff --git a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/progress.make b/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/progress.make
deleted file mode 100644
index 3b1f03a283674..0000000000000
--- a/examples/sycl/train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/progress.make
+++ /dev/null
@@ -1,3 +0,0 @@
-CMAKE_PROGRESS_1 = 52
-CMAKE_PROGRESS_2 = 53
-
diff --git a/examples/sycl/train-text-from-scratch/Makefile b/examples/sycl/train-text-from-scratch/Makefile
deleted file mode 100644
index 6b6a274f953ab..0000000000000
--- a/examples/sycl/train-text-from-scratch/Makefile
+++ /dev/null
@@ -1,222 +0,0 @@
-# CMAKE generated file: DO NOT EDIT!
-# Generated by "Unix Makefiles" Generator, CMake Version 3.22
-
-# Default target executed when no arguments are given to make.
-default_target: all
-.PHONY : default_target
-
-# Allow only one "make -f Makefile2" at a time, but pass parallelism.
-.NOTPARALLEL:
-
-#=============================================================================
-# Special targets provided by cmake.
-
-# Disable implicit rules so canonical targets will work.
-.SUFFIXES:
-
-# Disable VCS-based implicit rules.
-% : %,v
-
-# Disable VCS-based implicit rules.
-% : RCS/%
-
-# Disable VCS-based implicit rules.
-% : RCS/%,v
-
-# Disable VCS-based implicit rules.
-% : SCCS/s.%
-
-# Disable VCS-based implicit rules.
-% : s.%
-
-.SUFFIXES: .hpux_make_needs_suffix_list
-
-# Command-line flag to silence nested $(MAKE).
-$(VERBOSE)MAKESILENT = -s
-
-#Suppress display of executed commands.
-$(VERBOSE).SILENT:
-
-# A target that is always out of date.
-cmake_force:
-.PHONY : cmake_force
-
-#=============================================================================
-# Set environment variables for the build.
-
-# The shell in which to execute make rules.
-SHELL = /bin/sh
-
-# The CMake executable.
-CMAKE_COMMAND = /usr/bin/cmake
-
-# The command to remove a file.
-RM = /usr/bin/cmake -E rm -f
-
-# Escaping for special characters.
-EQUALS = =
-
-# The top-level source directory on which CMake was run.
-CMAKE_SOURCE_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples
-
-# The top-level build directory on which CMake was run.
-CMAKE_BINARY_DIR = /home/jianyuzh/ws/llama.cpp/develop/examples/sycl
-
-#=============================================================================
-# Targets provided globally by CMake.
-
-# Special rule for the target edit_cache
-edit_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "No interactive CMake dialog available..."
-	/usr/bin/cmake -E echo No\ interactive\ CMake\ dialog\ available.
-.PHONY : edit_cache
-
-# Special rule for the target edit_cache
-edit_cache/fast: edit_cache
-.PHONY : edit_cache/fast
-
-# Special rule for the target rebuild_cache
-rebuild_cache:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Running CMake to regenerate build system..."
-	/usr/bin/cmake --regenerate-during-build -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR)
-.PHONY : rebuild_cache
-
-# Special rule for the target rebuild_cache
-rebuild_cache/fast: rebuild_cache
-.PHONY : rebuild_cache/fast
-
-# Special rule for the target list_install_components
-list_install_components:
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Available install components are: \"Unspecified\""
-.PHONY : list_install_components
-
-# Special rule for the target list_install_components
-list_install_components/fast: list_install_components
-.PHONY : list_install_components/fast
-
-# Special rule for the target install
-install: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install
-
-# Special rule for the target install
-install/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Install the project..."
-	/usr/bin/cmake -P cmake_install.cmake
-.PHONY : install/fast
-
-# Special rule for the target install/local
-install/local: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local
-
-# Special rule for the target install/local
-install/local/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing only the local directory..."
-	/usr/bin/cmake -DCMAKE_INSTALL_LOCAL_ONLY=1 -P cmake_install.cmake
-.PHONY : install/local/fast
-
-# Special rule for the target install/strip
-install/strip: preinstall
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip
-
-# Special rule for the target install/strip
-install/strip/fast: preinstall/fast
-	@$(CMAKE_COMMAND) -E cmake_echo_color --switch=$(COLOR) --cyan "Installing the project stripped..."
-	/usr/bin/cmake -DCMAKE_INSTALL_DO_STRIP=1 -P cmake_install.cmake
-.PHONY : install/strip/fast
-
-# The main all target
-all: cmake_check_build_system
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch//CMakeFiles/progress.marks
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/all
-	$(CMAKE_COMMAND) -E cmake_progress_start /home/jianyuzh/ws/llama.cpp/develop/examples/sycl/CMakeFiles 0
-.PHONY : all
-
-# The main clean target
-clean:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/clean
-.PHONY : clean
-
-# The main clean target
-clean/fast: clean
-.PHONY : clean/fast
-
-# Prepare targets for installation.
-preinstall: all
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/preinstall
-.PHONY : preinstall
-
-# Prepare targets for installation.
-preinstall/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/preinstall
-.PHONY : preinstall/fast
-
-# clear depends
-depend:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 1
-.PHONY : depend
-
-# Convenience name for target.
-train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f CMakeFiles/Makefile2 train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule
-.PHONY : train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule
-
-# Convenience name for target.
-train-text-from-scratch: train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/rule
-.PHONY : train-text-from-scratch
-
-# fast build rule for target.
-train-text-from-scratch/fast:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build
-.PHONY : train-text-from-scratch/fast
-
-# target to build an object file
-train-text-from-scratch.o:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.o
-.PHONY : train-text-from-scratch.o
-
-# target to preprocess a source file
-train-text-from-scratch.i:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.i
-.PHONY : train-text-from-scratch.i
-
-# target to generate assembly for a file
-train-text-from-scratch.s:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(MAKE) $(MAKESILENT) -f train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/build.make train-text-from-scratch/CMakeFiles/train-text-from-scratch.dir/train-text-from-scratch.s
-.PHONY : train-text-from-scratch.s
-
-# Help Target
-help:
-	@echo "The following are some of the valid targets for this Makefile:"
-	@echo "... all (the default if no target is provided)"
-	@echo "... clean"
-	@echo "... depend"
-	@echo "... edit_cache"
-	@echo "... install"
-	@echo "... install/local"
-	@echo "... install/strip"
-	@echo "... list_install_components"
-	@echo "... rebuild_cache"
-	@echo "... train-text-from-scratch"
-	@echo "... train-text-from-scratch.o"
-	@echo "... train-text-from-scratch.i"
-	@echo "... train-text-from-scratch.s"
-.PHONY : help
-
-
-
-#=============================================================================
-# Special targets to cleanup operation of make.
-
-# Special rule to run CMake to check the build system integrity.
-# No rule that depends on this can have commands that come from listfiles
-# because they might be regenerated.
-cmake_check_build_system:
-	cd /home/jianyuzh/ws/llama.cpp/develop/examples/sycl && $(CMAKE_COMMAND) -S$(CMAKE_SOURCE_DIR) -B$(CMAKE_BINARY_DIR) --check-build-system CMakeFiles/Makefile.cmake 0
-.PHONY : cmake_check_build_system
-
diff --git a/examples/sycl/train-text-from-scratch/cmake_install.cmake b/examples/sycl/train-text-from-scratch/cmake_install.cmake
deleted file mode 100644
index ea9a267286952..0000000000000
--- a/examples/sycl/train-text-from-scratch/cmake_install.cmake
+++ /dev/null
@@ -1,60 +0,0 @@
-# Install script for directory: /home/jianyuzh/ws/llama.cpp/develop/examples/train-text-from-scratch
-
-# Set the install prefix
-if(NOT DEFINED CMAKE_INSTALL_PREFIX)
-  set(CMAKE_INSTALL_PREFIX "/usr/local")
-endif()
-string(REGEX REPLACE "/$" "" CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX}")
-
-# Set the install configuration name.
-if(NOT DEFINED CMAKE_INSTALL_CONFIG_NAME)
-  if(BUILD_TYPE)
-    string(REGEX REPLACE "^[^A-Za-z0-9_]+" ""
-           CMAKE_INSTALL_CONFIG_NAME "${BUILD_TYPE}")
-  else()
-    set(CMAKE_INSTALL_CONFIG_NAME "")
-  endif()
-  message(STATUS "Install configuration: \"${CMAKE_INSTALL_CONFIG_NAME}\"")
-endif()
-
-# Set the component getting installed.
-if(NOT CMAKE_INSTALL_COMPONENT)
-  if(COMPONENT)
-    message(STATUS "Install component: \"${COMPONENT}\"")
-    set(CMAKE_INSTALL_COMPONENT "${COMPONENT}")
-  else()
-    set(CMAKE_INSTALL_COMPONENT)
-  endif()
-endif()
-
-# Install shared libraries without execute permission?
-if(NOT DEFINED CMAKE_INSTALL_SO_NO_EXE)
-  set(CMAKE_INSTALL_SO_NO_EXE "1")
-endif()
-
-# Is this installation the result of a crosscompile?
-if(NOT DEFINED CMAKE_CROSSCOMPILING)
-  set(CMAKE_CROSSCOMPILING "FALSE")
-endif()
-
-# Set default install directory permissions.
-if(NOT DEFINED CMAKE_OBJDUMP)
-  set(CMAKE_OBJDUMP "/usr/bin/objdump")
-endif()
-
-if("x${CMAKE_INSTALL_COMPONENT}x" STREQUAL "xUnspecifiedx" OR NOT CMAKE_INSTALL_COMPONENT)
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch")
-    file(RPATH_CHECK
-         FILE "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch"
-         RPATH "")
-  endif()
-  file(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/bin" TYPE EXECUTABLE FILES "/home/jianyuzh/ws/llama.cpp/develop/examples/sycl/train-text-from-scratch/train-text-from-scratch")
-  if(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch" AND
-     NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch")
-    if(CMAKE_INSTALL_DO_STRIP)
-      execute_process(COMMAND "/usr/bin/strip" "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/bin/train-text-from-scratch")
-    endif()
-  endif()
-endif()
-

From 09b5619df440a50d5c54252d1d155a8403bfc6d5 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Mon, 15 Jan 2024 16:05:47 +0800
Subject: [PATCH 28/90] rm rear space

---
 examples/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index e9c738985fd4a..6b1d24e42d169 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -35,7 +35,7 @@ else()
     add_subdirectory(passkey)
     add_subdirectory(speculative)
     add_subdirectory(lookahead)
-    add_subdirectory(lookup)    
+    add_subdirectory(lookup)
     add_subdirectory(train-text-from-scratch)
     add_subdirectory(imatrix)
     if (LLAMA_BUILD_SERVER)

From d80dd65f42ecb48e814ab81735c6c37c15bd35d2 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Mon, 15 Jan 2024 16:07:31 +0800
Subject: [PATCH 29/90] dos2unix

---
 README_sycl.md | 420 ++++++++++++++++++++++++-------------------------
 1 file changed, 210 insertions(+), 210 deletions(-)

diff --git a/README_sycl.md b/README_sycl.md
index ddd7de3fcf3f9..26c925fe1a8bf 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -1,211 +1,211 @@
-# llama.cpp for SYCL
-
-## Background
-
-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
-
-oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
-
-Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
-
-
-## llama.cpp for SYCL
-
-To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
-
-The llama.cpp for SYCL is used to support Intel GPUs.
-
-For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
-
-## OS
-
-|OS|Status|Verified|
-|-|-|-|
-|Linux|Support|Ubuntu 22.04|
-|Windows|Ongoing| |
-
-
-## Intel GPU
-
-|Intel GPU| Status | Verified Model|
-|-|-|-|
-|Intel Data Center Max Series| Support| Max 1550|
-|Intel Data Center Flex Series| Support| Flex 170|
-|Intel Arc Series| Support| Arc 770|
-|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
-|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
-
-
-## Linux
-
-### Setup Environment
-
-1. Install Intel GPU driver.
-
-a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
-
-Note: for iGPU, please install the client GPU driver.
-
-b. Add user to group: video, render.
-
-```
-sudo usermod -aG render username
-sudo usermod -aG video username
-```
-
-Note: re-login to enable it.
-
-c. Check
-
-```
-sudo apt install clinfo
-sudo clinfo -l
-```
-
-Output (example):
-
-```
-Platform #0: Intel(R) OpenCL Graphics
- `-- Device #0: Intel(R) Arc(TM) A770 Graphics
-
-
-Platform #0: Intel(R) OpenCL HD Graphics
- `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
-```
-
-2. Install Intel® oneAPI Base toolkit.
-
-
-a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ] (https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
-
-Recommend to install to default folder: **/opt/intel/oneapi**.
-
-Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
-
-b. Check
-
-```
-source /opt/intel/oneapi/setvars.sh
-
-sycl-ls
-```
-
-There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
-
-Output (example):
-```
-[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
-[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
-[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
-[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
-
-```
-
-2. Build locally:
-
-```
-mkdir -p build
-cd build
-source /opt/intel/oneapi/setvars.sh
-
-#for FP16
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
-
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-#build example/main only
-#cmake --build . --config Release --target main
-
-#build all binary
-cmake --build . --config Release -v
-
-```
-
-or
-
-```
-./sycl_build.sh
-```
-
-### Run
-
-1. Put model file to folder **models**
-
-2. Enable oneAPI running environment
-
-```
-source /opt/intel/oneapi/setvars.sh
-```
-
-3. List device ID
-
-Run without parameter:
-
-```
-./build/bin/ls-sycl-device
-
-or
-
-./build/bin/main
-```
-
-Check the ID in startup log, like:
-
-```
-found 4 SYCL devices:
-  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
-	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
-	max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
-  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
-	max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
-  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
-	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
-
-```
-
-|Attribute|Note|
-|-|-|
-|compute capability 1.3|Level-zero running time, recommended |
-|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
-
-4. Set device ID and execute llama.cpp
-
-Set device ID = 0 by **GGML_SYCL_DEVICE=0**
-
-```
-GGML_SYCL_DEVICE=0 && ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
-```
-or run by script:
-
-```
-./examples/sycl_run_llama2.sh
-```
-
-5. Check the device ID in output
-
-Like：
-```
-Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
-```
-
-
-### Environment Variable
-
-#### Build
-
-|Name|Value|Function|
-|-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path|
-|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
-|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
-|GGML_SYCL_F16|OFF (default) or ON|Enable FP16 in computing|
-
-
-#### Running
-
-
-|Name|Value|Function|
-|-|-|-|
-|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
+# llama.cpp for SYCL
+
+## Background
+
+SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
+
+oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.
+
+Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
+
+
+## llama.cpp for SYCL
+
+To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
+
+The llama.cpp for SYCL is used to support Intel GPUs.
+
+For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).
+
+## OS
+
+|OS|Status|Verified|
+|-|-|-|
+|Linux|Support|Ubuntu 22.04|
+|Windows|Ongoing| |
+
+
+## Intel GPU
+
+|Intel GPU| Status | Verified Model|
+|-|-|-|
+|Intel Data Center Max Series| Support| Max 1550|
+|Intel Data Center Flex Series| Support| Flex 170|
+|Intel Arc Series| Support| Arc 770|
+|Intel built-in Arc GPU| Support| built-in Arc GPU in Meteor Lake|
+|Intel iGPU| Support| iGPU in i5-1250P, i7-1165G7|
+
+
+## Linux
+
+### Setup Environment
+
+1. Install Intel GPU driver.
+
+a. Please install Intel GPU driver by official guide: [Install GPU Drivers](https://dgpu-docs.intel.com/driver/installation.html).
+
+Note: for iGPU, please install the client GPU driver.
+
+b. Add user to group: video, render.
+
+```
+sudo usermod -aG render username
+sudo usermod -aG video username
+```
+
+Note: re-login to enable it.
+
+c. Check
+
+```
+sudo apt install clinfo
+sudo clinfo -l
+```
+
+Output (example):
+
+```
+Platform #0: Intel(R) OpenCL Graphics
+ `-- Device #0: Intel(R) Arc(TM) A770 Graphics
+
+
+Platform #0: Intel(R) OpenCL HD Graphics
+ `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
+```
+
+2. Install Intel® oneAPI Base toolkit.
+
+
+a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ] (https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+
+Recommend to install to default folder: **/opt/intel/oneapi**.
+
+Following guide use the default folder as example. If you use other folder, please modify the following guide info with your folder.
+
+b. Check
+
+```
+source /opt/intel/oneapi/setvars.sh
+
+sycl-ls
+```
+
+There should be one or more level-zero devices. Like **[ext_oneapi_level_zero:gpu:0]**.
+
+Output (example):
+```
+[opencl:acc:0] Intel(R) FPGA Emulation Platform for OpenCL(TM), Intel(R) FPGA Emulation Device OpenCL 1.2  [2023.16.10.0.17_160000]
+[opencl:cpu:1] Intel(R) OpenCL, 13th Gen Intel(R) Core(TM) i7-13700K OpenCL 3.0 (Build 0) [2023.16.10.0.17_160000]
+[opencl:gpu:2] Intel(R) OpenCL Graphics, Intel(R) Arc(TM) A770 Graphics OpenCL 3.0 NEO  [23.30.26918.50]
+[ext_oneapi_level_zero:gpu:0] Intel(R) Level-Zero, Intel(R) Arc(TM) A770 Graphics 1.3 [1.3.26918]
+
+```
+
+2. Build locally:
+
+```
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
+
+#for FP16
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+#build example/main only
+#cmake --build . --config Release --target main
+
+#build all binary
+cmake --build . --config Release -v
+
+```
+
+or
+
+```
+./sycl_build.sh
+```
+
+### Run
+
+1. Put model file to folder **models**
+
+2. Enable oneAPI running environment
+
+```
+source /opt/intel/oneapi/setvars.sh
+```
+
+3. List device ID
+
+Run without parameter:
+
+```
+./build/bin/ls-sycl-device
+
+or
+
+./build/bin/main
+```
+
+Check the ID in startup log, like:
+
+```
+found 4 SYCL devices:
+  Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
+	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+  Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
+	max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
+  Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
+	max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
+  Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
+	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+
+```
+
+|Attribute|Note|
+|-|-|
+|compute capability 1.3|Level-zero running time, recommended |
+|compute capability 3.0|OpenCL running time, slower than level-zero in most cases|
+
+4. Set device ID and execute llama.cpp
+
+Set device ID = 0 by **GGML_SYCL_DEVICE=0**
+
+```
+GGML_SYCL_DEVICE=0 && ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+```
+or run by script:
+
+```
+./examples/sycl_run_llama2.sh
+```
+
+5. Check the device ID in output
+
+Like：
+```
+Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
+```
+
+
+### Environment Variable
+
+#### Build
+
+|Name|Value|Function|
+|-|-|-|
+|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path|
+|CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
+|CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
+|GGML_SYCL_F16|OFF (default) or ON|Enable FP16 in computing|
+
+
+#### Running
+
+
+|Name|Value|Function|
+|-|-|-|
+|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
\ No newline at end of file

From 593ce001e2671b0cbf4abfacc0dc92960c0656ee Mon Sep 17 00:00:00 2001
From: Neo Zhang Jianyu <jianyu.zhang@intel.com>
Date: Thu, 18 Jan 2024 10:58:56 +0800
Subject: [PATCH 30/90] Update README_sycl.md

---
 README_sycl.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README_sycl.md b/README_sycl.md
index 26c925fe1a8bf..df84e99b2049e 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -76,7 +76,7 @@ Platform #0: Intel(R) OpenCL HD Graphics
 2. Install Intel® oneAPI Base toolkit.
 
 
-a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ] (https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
 
 Recommend to install to default folder: **/opt/intel/oneapi**.
 
@@ -208,4 +208,4 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 |Name|Value|Function|
 |-|-|-|
 |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
-|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
\ No newline at end of file
+|GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|

From 57e9fbadb268cc294864e68d6cc59b8a15e76ca9 Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Thu, 18 Jan 2024 15:23:27 +0800
Subject: [PATCH 31/90] fix return type

---
 ggml-sycl.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 2e6e686505232..4de78a7666c28 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -12005,7 +12005,7 @@ static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_ba
     UNUSED(plan);
 }
 
-static void ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
     ggml_backend_context_sycl * sycl_ctx = (ggml_backend_context_sycl *)backend->context;
 
     ggml_sycl_set_main_device(sycl_ctx->device);
@@ -12062,6 +12062,7 @@ static void ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
     }
 
     UNUSED(backend);
+    return;
 }
 
 static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {

From d5f7d364f6f8bb43b7a0c01587fc16bc194c3527 Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Thu, 18 Jan 2024 16:37:25 +0800
Subject: [PATCH 32/90] remove sycl version from include path

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d355687702c05..035c66c08599e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -460,7 +460,7 @@ if (LLAMA_SYCL)
     add_compile_definitions(GGML_USE_SYCL)
 
     add_compile_options(-I./) #include DPCT 
-    add_compile_options(-I/${ONEAPI_ROOT}/2024.0/include)
+    add_compile_options(-I/${SYCL_INCLUDE_DIR})
 
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -Wno-narrowing")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")

From 35a0daaaa1a0ba4af6b1d6ac0dfe3c9d9c062443 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Thu, 18 Jan 2024 20:13:58 +0800
Subject: [PATCH 33/90] restore rm code to fix hang issue

---
 ggml.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/ggml.c b/ggml.c
index d59af30b30691..67c791b27dad6 100644
--- a/ggml.c
+++ b/ggml.c
@@ -16570,6 +16570,28 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 
                 //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
+
+#if defined(GGML_USE_CUBLAS)
+                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
+                    n_tasks = 1; // TODO: this actually is doing nothing
+                                 //       the threads are still spinning
+                }
+#elif defined(GGML_USE_CLBLAST)
+                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
+                    n_tasks = 1; // TODO: this actually is doing nothing
+                                 //       the threads are still spinning
+                }
+#elif defined(GGML_USE_SYCL)
+                if (ggml_sycl_can_mul_mat(node->src[0], node->src[1], node)) {
+                    n_tasks = 1;
+                }
+#endif
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
+                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                    n_tasks = 1; // TODO: this actually is doing nothing
+                                 //       the threads are still spinning
+                }
+#endif
             } break;
         case GGML_OP_MUL_MAT_ID:
             {

From ae941b1b57e446fd162ca8afac4b4ed5ed0ab569 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Fri, 19 Jan 2024 09:52:04 +0800
Subject: [PATCH 34/90] add syc and link for sycl readme

---
 README.md | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cbfba01bc4b62..8a0b363409410 100644
--- a/README.md
+++ b/README.md
@@ -63,7 +63,7 @@ The main goal of `llama.cpp` is to run the LLaMA model using 4-bit integer quant
 - AVX, AVX2 and AVX512 support for x86 architectures
 - Mixed F16 / F32 precision
 - 2-bit, 3-bit, 4-bit, 5-bit, 6-bit and 8-bit integer quantization support
-- CUDA, Metal and OpenCL GPU backend support
+- CUDA, Metal, OpenCL, SYCL GPU backend support
 
 The original implementation of `llama.cpp` was [hacked in an evening](https://github.com/ggerganov/llama.cpp/issues/33#issuecomment-1465108022).
 Since then, the project has improved significantly thanks to many contributions. This project is mainly for educational purposes and serves
@@ -597,6 +597,15 @@ Building the program with BLAS support may lead to some performance improvements
 
   You can get a list of platforms and devices from the `clinfo -l` command, etc.
 
+- #### SYCL
+
+  SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators.
+
+  llama.cpp based on SYCL is used to support Intel GPU (Data Center Max series, Flex series, Arc series, Built-in GPU and iGPU).
+
+  For detailed info, please refer to [llama.cpp for SYCL](README_sycl.md).
+
+
 ### Prepare Data & Run
 
 ```bash

From e3481faa2f3bee434f95aaa566203421f9c5f222 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Fri, 19 Jan 2024 09:53:48 +0800
Subject: [PATCH 35/90] rm original sycl code before refactor

---
 ggml-sycl.cpp.base | 11951 -------------------------------------------
 1 file changed, 11951 deletions(-)
 delete mode 100644 ggml-sycl.cpp.base

diff --git a/ggml-sycl.cpp.base b/ggml-sycl.cpp.base
deleted file mode 100644
index ab7af226f6891..0000000000000
--- a/ggml-sycl.cpp.base
+++ /dev/null
@@ -1,11951 +0,0 @@
-#define DPCT_COMPAT_RT_VERSION 12010
-#include <sycl/sycl.hpp>
-#include <dpct/dpct.hpp>
-#include <algorithm>
-#include <assert.h>
-#include <atomic>
-#include <cinttypes>
-#include <cstddef>
-#include <cstdint>
-#include <cstdlib>
-#include <float.h>
-#include <limits>
-#include <stdint.h>
-#include <stdio.h>
-#include <vector>
-#include <dpct/blas_utils.hpp>
-
-#if defined(GGML_USE_HIPBLAS)
-#include <hip/hip_runtime.h>
-#include <hipblas/hipblas.h>
-#include <hip/hip_fp16.h>
-#ifdef __HIP_PLATFORM_AMD__
-// for rocblas_initialize()
-#include "rocblas/rocblas.h"
-#endif // __HIP_PLATFORM_AMD__
-#define CUBLAS_COMPUTE_16F HIPBLAS_R_16F
-#define CUBLAS_COMPUTE_32F HIPBLAS_R_32F
-#define CUBLAS_COMPUTE_32F_FAST_16F HIPBLAS_R_32F
-#define CUBLAS_GEMM_DEFAULT HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_GEMM_DEFAULT_TENSOR_OP HIPBLAS_GEMM_DEFAULT
-#define CUBLAS_OP_N HIPBLAS_OP_N
-#define CUBLAS_OP_T HIPBLAS_OP_T
-#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH 0
-#define CUDA_R_16F  HIPBLAS_R_16F
-#define CUDA_R_32F  HIPBLAS_R_32F
-#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
-#define cublasCreate hipblasCreate
-#define cublasGemmEx hipblasGemmEx
-#define cublasGemmBatchedEx hipblasGemmBatchedEx
-#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx
-#define cublasHandle_t hipblasHandle_t
-#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
-#define cublasSetStream hipblasSetStream
-#define cublasSgemm hipblasSgemm
-#define cublasStatus_t hipblasStatus_t
-#define cudaDeviceCanAccessPeer hipDeviceCanAccessPeer
-#define cudaDeviceDisablePeerAccess hipDeviceDisablePeerAccess
-#define cudaDeviceEnablePeerAccess hipDeviceEnablePeerAccess
-#define cudaDeviceProp hipDeviceProp_t
-#define cudaDeviceSynchronize hipDeviceSynchronize
-#define cudaError_t hipError_t
-#define cudaEventCreateWithFlags hipEventCreateWithFlags
-#define cudaEventDisableTiming hipEventDisableTiming
-#define cudaEventRecord hipEventRecord
-#define cudaEvent_t hipEvent_t
-#define cudaEventDestroy hipEventDestroy
-#define cudaFree hipFree
-#define cudaFreeHost hipHostFree
-#define cudaGetDevice hipGetDevice
-#define cudaGetDeviceCount hipGetDeviceCount
-#define cudaGetDeviceProperties hipGetDeviceProperties
-#define cudaGetErrorString hipGetErrorString
-#define cudaGetLastError hipGetLastError
-#define cudaMalloc hipMalloc
-#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
-#define cudaMemcpy hipMemcpy
-#define cudaMemcpy2DAsync hipMemcpy2DAsync
-#define cudaMemcpyAsync hipMemcpyAsync
-#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
-#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
-#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
-#define cudaMemcpyKind hipMemcpyKind
-#define cudaMemset hipMemset
-#define cudaMemsetAsync hipMemsetAsync
-#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
-#define cudaSetDevice hipSetDevice
-#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
-#define cudaStreamFireAndForget hipStreamFireAndForget
-#define cudaStreamNonBlocking hipStreamNonBlocking
-#define cudaStreamSynchronize hipStreamSynchronize
-#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
-#define cudaStream_t hipStream_t
-#define cudaSuccess hipSuccess
-#else
-#endif // defined(GGML_USE_HIPBLAS)
-
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-#include <cmath>
-
-#include <dpct/lib_common_utils.hpp>
-
-#include <chrono>
-
-#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA      700
-#define CC_OFFSET_AMD 1000000
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
-
-#define GGML_CUDA_MAX_NODES 8192
-
-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// -  7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-//       probably other such cases, and not sure what happens on AMD hardware
-#if !defined(GGML_CUDA_FORCE_MMQ)
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-// max batch size to use MMQ kernels when tensor cores are available
-#define MMQ_MAX_BATCH_SIZE 32
-
-#if defined(GGML_USE_HIPBLAS)
-#define __CUDA_ARCH__ 1300
-
-#if defined(__gfx1100__) || defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx1103__) || \
-    defined(__gfx1150__) || defined(__gfx1151__)
-#define RDNA3
-#endif
-
-#if defined(__gfx1030__) || defined(__gfx1031__) || defined(__gfx1032__) || defined(__gfx1033__) || \
-    defined(__gfx1034__) || defined(__gfx1035__) || defined(__gfx1036__) || defined(__gfx1037__)
-#define RDNA2
-#endif
-
-#ifndef __has_builtin
-    #define __has_builtin(x) 0
-#endif
-
-typedef int8_t int8x4_t __attribute__((ext_vector_type(4)));
-static __device__ __forceinline__ int __vsubss4(const int a, const int b) {
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-#if __has_builtin(__builtin_elementwise_sub_sat)
-    const int8x4_t c = __builtin_elementwise_sub_sat(va, vb);
-    return reinterpret_cast<const int&>(c);
-#else
-    int8x4_t c;
-    int16_t tmp;
-#pragma unroll
-    for (int i = 0; i < 4; i++) {
-        tmp = va[i] - vb[i];
-        if(tmp > std::numeric_limits<int8_t>::max()) tmp = std::numeric_limits<int8_t>::max();
-        if(tmp < std::numeric_limits<int8_t>::min()) tmp = std::numeric_limits<int8_t>::min();
-        c[i] = tmp;
-    }
-    return reinterpret_cast<int&>(c);
-#endif // __has_builtin(__builtin_elementwise_sub_sat)
-}
-
-static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
-#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(__gfx1030__)
-    c = __builtin_amdgcn_sdot4(a, b, c, false);
-#elif defined(__gfx1100__)
-    c = __builtin_amdgcn_sudot4( true, a, true, b, c, false);
-#elif defined(__gfx1010__) || defined(__gfx900__)
-    int tmp1;
-    int tmp2;
-    asm("\n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        v_mul_i32_i24 %1, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 \n \
-        v_mul_i32_i24 %2, sext(%3), sext(%4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_3 \n \
-        v_add3_u32 %0, %1, %2, %0 \n \
-        "
-        : "+v"(c), "=&v"(tmp1), "=&v"(tmp2)
-        : "v"(a), "v"(b)
-    );
-#else
-    const int8x4_t va = reinterpret_cast<const int8x4_t&>(a);
-    const int8x4_t vb = reinterpret_cast<const int8x4_t&>(b);
-    c += va[0] * vb[0] + va[1] * vb[1] + va[2] * vb[2] + va[3] * vb[3];
-#endif
-    return c;
-}
-#endif // defined(GGML_USE_HIPBLAS)
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
-
-/*
-DPCT1001:63: The statement could not be removed.
-*/
-/*
-DPCT1000:64: Error handling if-stmt was detected but could not be rewritten.
-*/
-/*
-DPCT1009:65: SYCL uses exceptions to report errors and does not use the error
-codes. The original code was commented out and a warning string was inserted.
-You need to rewrite this code.
-*/
-#define CUDA_CHECK(err)                                                              \
-    do {                                                                             \
-        dpct::err0 err_ = (err);                                                     \
-        if (err_ != 0) {                                                             \
-            int id;                                                                  \
-            id = dpct::dev_mgr::instance().current_device_id();                      \
-            fprintf(                                                                 \
-                stderr, "\nCUDA error %d at %s:%d: %s\n", err_, __FILE__,            \
-                __LINE__,                                                            \
-                "cudaGetErrorString is not supported" /*cudaGetErrorString(err_)*/); \
-            fprintf(stderr, "current device: %d\n", id);                             \
-            GGML_ASSERT(!"CUDA error");                                              \
-        }                                                                            \
-    } while (0)
-
-#if DPCT_COMPAT_RT_VERSION >= 12000
-/*
-DPCT1009:106: SYCL uses exceptions to report errors and does not use the error
-codes. The original code was commented out and a warning string was inserted.
-You need to rewrite this code.
-*/
-#define CUBLAS_CHECK(err)                                                                  \
-    do {                                                                                   \
-        int err_ = (err);                                                                  \
-        if (err_ != 0) {                                                                   \
-            int id;                                                                        \
-            id = dpct::dev_mgr::instance().current_device_id();                            \
-            fprintf(                                                                       \
-                stderr, "\ncuBLAS error %d at %s:%d: %s\n", err_, __FILE__,                \
-                __LINE__,                                                                  \
-                "cublasGetStatusString is not supported" /*cublasGetStatusString(err_)*/); \
-            fprintf(stderr, "current device: %d\n", id);                                   \
-            GGML_ASSERT(!"cuBLAS error");                                                  \
-        }                                                                                  \
-    } while (0)
-#else
-#define CUBLAS_CHECK(err)                                                               \
-    do {                                                                                \
-        cublasStatus_t err_ = (err);                                                    \
-        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
-            int id;                                                                     \
-            cudaGetDevice(&id);                                                         \
-            fprintf(stderr, "\ncuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);  \
-            fprintf(stderr, "current device: %d\n", id);                                \
-            GGML_ASSERT(!"cuBLAS error");                                               \
-        }                                                                               \
-    } while (0)
-#endif // CUDART_VERSION >= 11
-
-#if DPCT_COMPAT_RT_VERSION >= 11100
-#define GGML_CUDA_ASSUME(x) __builtin_assume(x)
-#else
-#define GGML_CUDA_ASSUME(x)
-#endif // CUDART_VERSION >= 11100
-
-#ifdef GGML_CUDA_F16
-typedef half dfloat; // dequantize float
-typedef half2 dfloat2;
-#else
-typedef float dfloat; // dequantize float
-typedef sycl::float2 dfloat2;
-#endif //GGML_CUDA_F16
-
-static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __dpct_inline__ int get_int_from_uint8(const uint8_t *x8,
-                                              const int &i32) {
-    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
-
-    int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
-
-    return x32;
-}
-
-static __dpct_inline__ int get_int_from_int8_aligned(const int8_t *x8,
-                                                     const int &i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-static __dpct_inline__ int get_int_from_uint8_aligned(const uint8_t *x8,
-                                                      const int &i32) {
-    return *((const int *) (x8 + sizeof(int) * i32)); // assume at least 4 byte alignment
-}
-
-template <typename T>
-using to_t_cuda_t = void (*)(const void *__restrict__ x, T *__restrict__ y,
-                             int k, dpct::queue_ptr stream);
-typedef to_t_cuda_t<float> to_fp32_cuda_t;
-typedef to_t_cuda_t<sycl::half> to_fp16_cuda_t;
-
-typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v);
-typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v);
-typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
-typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
-typedef void (*ggml_cuda_op_mul_mat_t)(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream);
-typedef void (*ggml_cuda_op_flatten_t)(const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst, const float *src0_dd,
-                                       const float *src1_dd, float *dst_dd,
-                                       const dpct::queue_ptr &main_stream);
-
-// QK = number of values after dequantization
-// QR = QK / number of values before dequantization
-// QI = number of 32 bit integers before dequantization
-
-#define QK4_0 32
-#define QR4_0 2
-#define QI4_0 (QK4_0 / (4 * QR4_0))
-typedef struct dpct_type_666762 {
-    sycl::half d;           // delta
-    uint8_t qs[QK4_0 / 2];  // nibbles / quants
-} block_q4_0;
-static_assert(sizeof(block_q4_0) == sizeof(ggml_fp16_t) + QK4_0 / 2, "wrong q4_0 block size/padding");
-
-#define QK4_1 32
-#define QR4_1 2
-#define QI4_1 (QK4_1 / (4 * QR4_1))
-typedef struct dpct_type_237623 {
-    sycl::half2 dm;         // dm.x = delta, dm.y = min
-    uint8_t qs[QK4_1 / 2];  // nibbles / quants
-} block_q4_1;
-static_assert(sizeof(block_q4_1) == sizeof(ggml_fp16_t) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
-
-#define QK5_0 32
-#define QR5_0 2
-#define QI5_0 (QK5_0 / (4 * QR5_0))
-typedef struct dpct_type_447594 {
-    sycl::half d;           // delta
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_0 / 2];  // nibbles / quants
-} block_q5_0;
-static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
-
-#define QK5_1 32
-#define QR5_1 2
-#define QI5_1 (QK5_1 / (4 * QR5_1))
-typedef struct dpct_type_150249 {
-    sycl::half2 dm;         // dm.x = delta, dm.y = min
-    uint8_t qh[4];          // 5-th bit of quants
-    uint8_t qs[QK5_1 / 2];  // nibbles / quants
-} block_q5_1;
-static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
-
-#define QK8_0 32
-#define QR8_0 1
-#define QI8_0 (QK8_0 / (4 * QR8_0))
-typedef struct dpct_type_281565 {
-    sycl::half d;           // delta
-    int8_t  qs[QK8_0];      // quants
-} block_q8_0;
-static_assert(sizeof(block_q8_0) == sizeof(ggml_fp16_t) + QK8_0, "wrong q8_0 block size/padding");
-
-#define QK8_1 32
-#define QR8_1 1
-#define QI8_1 (QK8_1 / (4 * QR8_1))
-typedef struct dpct_type_126688 {
-    sycl::half2 ds;         // ds.x = delta, ds.y = sum
-    int8_t  qs[QK8_0];      // quants
-} block_q8_1;
-static_assert(sizeof(block_q8_1) == 2*sizeof(ggml_fp16_t) + QK8_0, "wrong q8_1 block size/padding");
-
-typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
-typedef void (*allocate_tiles_cuda_t)(int **x_ql, sycl::half2 **x_dm,
-                                      int **x_qh, int **x_sc);
-typedef void (*load_tiles_cuda_t)(const void *__restrict__ vx,
-                                  int *__restrict__ x_ql,
-                                  sycl::half2 *__restrict__ x_dm,
-                                  int *__restrict__ x_qh,
-                                  int *__restrict__ x_sc, const int &i_offset,
-                                  const int &i_max, const int &k,
-                                  const int &blocks_per_row);
-typedef float (*vec_dot_q_mul_mat_cuda_t)(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ms,
-    const int &i, const int &j, const int &k);
-
-//================================= k-quants
-
-#ifdef GGML_QKK_64
-#define QK_K 64
-#define K_SCALE_SIZE 4
-#else
-#define QK_K 256
-#define K_SCALE_SIZE 12
-#endif
-
-#define QR2_K 4
-#define QI2_K (QK_K / (4*QR2_K))
-typedef struct dpct_type_141842 {
-    uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
-    uint8_t qs[QK_K/4];      // quants
-    sycl::half2 dm;          // super-block scale for quantized scales/mins
-} block_q2_K;
-static_assert(sizeof(block_q2_K) == 2*sizeof(ggml_fp16_t) + QK_K/16 + QK_K/4, "wrong q2_K block size/padding");
-
-#define QR3_K 4
-#define QI3_K (QK_K / (4*QR3_K))
-typedef struct dpct_type_116731 {
-    uint8_t hmask[QK_K/8];     // quants - high bit
-    uint8_t qs[QK_K/4];        // quants - low 2 bits
-#ifdef GGML_QKK_64
-    uint8_t scales[2]; // scales, quantized with 8 bits
-#else
-    uint8_t scales[K_SCALE_SIZE]; // scales, quantized with 6 bits
-#endif
-    sycl::half d; // super-block scale
-} block_q3_K;
-//static_assert(sizeof(block_q3_K) == sizeof(ggml_fp16_t) + QK_K / 4 + QK_K / 8 + K_SCALE_SIZE, "wrong q3_K block size/padding");
-
-#define QR4_K 2
-#define QI4_K (QK_K / (4*QR4_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half    dm[2];             // super-block scales/mins
-    uint8_t scales[2];         // 4-bit block scales/mins
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == sizeof(half2) + QK_K/2 + 2, "wrong q4_K block size/padding");
-#else
-typedef struct dpct_type_801816 {
-    sycl::half2 dm;            // super-block scale for quantized scales/mins
-    uint8_t scales[3*QK_K/64]; // scales, quantized with 6 bits
-    uint8_t qs[QK_K/2];        // 4--bit quants
-} block_q4_K;
-static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_fp16_t) + 3*QK_K/64 + QK_K/2, "wrong q4_K block size/padding");
-#endif
-
-#define QR5_K 2
-#define QI5_K (QK_K / (4*QR5_K))
-#ifdef GGML_QKK_64
-typedef struct {
-    half d;                  // super-block scale
-    int8_t scales[QK_K/16];  // block scales
-    uint8_t qh[QK_K/8];      // quants, high bit
-    uint8_t qs[QK_K/2];      // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == sizeof(ggml_fp16_t) + QK_K/2 + QK_K/8 + QK_K/16, "wrong q5_K block size/padding");
-#else
-typedef struct dpct_type_105170 {
-    sycl::half2 dm;               // super-block scale for quantized scales/mins
-    uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits
-    uint8_t qh[QK_K/8];           // quants, high bit
-    uint8_t qs[QK_K/2];           // quants, low 4 bits
-} block_q5_K;
-static_assert(sizeof(block_q5_K) == 2*sizeof(ggml_fp16_t) + K_SCALE_SIZE + QK_K/2 + QK_K/8, "wrong q5_K block size/padding");
-#endif
-
-#define QR6_K 2
-#define QI6_K (QK_K / (4*QR6_K))
-typedef struct dpct_type_341564 {
-    uint8_t ql[QK_K/2];   // quants, lower 4 bits
-    uint8_t qh[QK_K/4];   // quants, upper 2 bits
-    int8_t  scales[QK_K/16]; // scales
-    sycl::half d;            // delta
-} block_q6_K;
-static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding");
-
-#define WARP_SIZE 32
-#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
-
-#define CUDA_GELU_BLOCK_SIZE 256
-#define CUDA_SILU_BLOCK_SIZE 256
-#define CUDA_TANH_BLOCK_SIZE 256
-#define CUDA_RELU_BLOCK_SIZE 256
-#define CUDA_SQR_BLOCK_SIZE 256
-#define CUDA_CPY_BLOCK_SIZE 32
-#define CUDA_SCALE_BLOCK_SIZE 256
-#define CUDA_CLAMP_BLOCK_SIZE 256
-#define CUDA_ROPE_BLOCK_SIZE 256
-#define CUDA_SOFT_MAX_BLOCK_SIZE 1024
-#define CUDA_ALIBI_BLOCK_SIZE 32
-#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
-#define CUDA_QUANTIZE_BLOCK_SIZE 256
-#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
-#define CUDA_GET_ROWS_BLOCK_SIZE 256
-#define CUDA_UPSCALE_BLOCK_SIZE 256
-#define CUDA_CONCAT_BLOCK_SIZE 256
-#define CUDA_PAD_BLOCK_SIZE 256
-#define CUDA_ACC_BLOCK_SIZE 256
-#define CUDA_IM2COL_BLOCK_SIZE 256
-
-// dmmv = dequantize_mul_mat_vec
-#ifndef GGML_CUDA_DMMV_X
-#define GGML_CUDA_DMMV_X 32
-#endif
-#ifndef GGML_CUDA_MMV_Y
-#define GGML_CUDA_MMV_Y 1
-#endif
-
-#ifndef K_QUANTS_PER_ITERATION
-#define K_QUANTS_PER_ITERATION 2
-#else
-static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUANTS_PER_ITERATION must be 1 or 2");
-#endif
-
-#ifndef GGML_CUDA_PEER_MAX_BATCH_SIZE
-#define GGML_CUDA_PEER_MAX_BATCH_SIZE 128
-#endif // GGML_CUDA_PEER_MAX_BATCH_SIZE
-
-#define MUL_MAT_SRC1_COL_STRIDE 128
-
-#define MAX_STREAMS 8
-static dpct::queue_ptr g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = {
-    {&dpct::get_in_order_queue()}};
-
-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-    dpct::event_ptr events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS];
-    std::chrono::time_point<std::chrono::steady_clock> events_ct1_0[16];
-    std::chrono::time_point<std::chrono::steady_clock>
-        events_ct1_is[16] ; // events for synchronizing multiple GPUs
-};
-
-// this is faster on Windows
-// probably because the Windows CUDA libraries forget to make this check before invoking the drivers
-inline dpct::err0 ggml_cuda_set_device(const int device) try {
-    int current_device;
-    current_device = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(current_device = dpct::dev_mgr::instance().current_device_id());
-
-    if (device == current_device) {
-        return 0;
-    }
-
-    /*
-    DPCT1093:66: The "device" device may be not the one intended for use. Adjust
-    the selected device if needed.
-    */
-    return DPCT_CHECK_ERROR(dpct::select_device(device));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static int g_device_count = -1;
-static int g_main_device = 0;
-static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
-static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
-
-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 0; // disabled by default
-static size_t g_scratch_offset = 0;
-
-static dpct::queue_ptr g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-static __dpct_inline__ float warp_reduce_sum(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:0: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        /*
-        DPCT1096:127: The right-most dimension of the work-group used in the
-        SYCL kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), x, mask);
-    }
-    return x;
-}
-
-static __dpct_inline__ sycl::float2
-warp_reduce_sum(sycl::float2 a, const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:1: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        a.x() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.x(),
-                                                mask);
-        /*
-        DPCT1023:2: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        a.y() += dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), a.y(),
-                                                mask);
-    }
-    return a;
-}
-
-static __dpct_inline__ float warp_reduce_max(float x,
-                                             const sycl::nd_item<3> &item_ct1) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:3: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        /*
-        DPCT1096:126: The right-most dimension of the work-group used in the
-        SYCL kernel that calls this function may be less than "32". The function
-        "dpct::permute_sub_group_by_xor" may return an unexpected result on the
-        CPU device. Modify the size of the work-group to ensure that the value
-        of the right-most dimension is a multiple of "32".
-        */
-        x = sycl::fmax(x, dpct::permute_sub_group_by_xor(
-                              item_ct1.get_sub_group(), x, mask));
-    }
-    return x;
-}
-
-static __dpct_inline__ float op_repeat(const float a, const float b) {
-    return b;
-}
-
-static __dpct_inline__ float op_add(const float a, const float b) {
-    return a + b;
-}
-
-static __dpct_inline__ float op_mul(const float a, const float b) {
-    return a * b;
-}
-
-static __dpct_inline__ float op_div(const float a, const float b) {
-    return a / b;
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
-    const int i0s = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int i1 = (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1));
-    const int i2 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) /
-                   ne3;
-    const int i3 = (item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                    item_ct1.get_local_id(0)) %
-                   ne3;
-
-    if (i0s >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    for (int i0 = i0s; i0 < ne0;
-         i0 += item_ct1.get_local_range(2) * item_ct1.get_group_range(2)) {
-        const int i10 = i0 % ne10;
-        dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-    }
-}
-
-template<float (*bin_op)(const float, const float), typename src0_t, typename src1_t, typename dst_t>
-static void k_bin_bcast_unravel(const src0_t * src0, const src1_t * src1, dst_t * dst,
-        int ne0, int ne1, int ne2, int ne3,
-        int ne10, int ne11, int ne12, int ne13,
-        /*int s0, */ int s1,  int s2,  int s3,
-        /*int s10,*/ int s11, int s12, int s13,
-        const sycl::nd_item<3> &item_ct1) {
-
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    const int i3 = i/(ne2*ne1*ne0);
-    const int i2 = (i/(ne1*ne0)) % ne2;
-    const int i1 = (i/ne0) % ne1;
-    const int i0 = i % ne0;
-
-    if (i0 >= ne0 || i1 >= ne1 || i2 >= ne2 || i3 >= ne3) {
-        return;
-    }
-
-    const int i11 = i1 % ne11;
-    const int i12 = i2 % ne12;
-    const int i13 = i3 % ne13;
-
-    const size_t i_src0 = i3*s3 + i2*s2 + i1*s1;
-    const size_t i_src1 = i13*s13 + i12*s12 + i11*s11;
-    const size_t i_dst  = i_src0;
-
-    const src0_t * src0_row = src0 + i_src0;
-    const src1_t * src1_row = src1 + i_src1;
-    dst_t * dst_row = dst + i_dst;
-
-    const int i10 = i0 % ne10;
-    dst_row[i0] = (dst_t)bin_op(src0 ? (float)src0_row[i0] : 0.0f, (float)src1_row[i10]);
-}
-
-static void acc_f32(const float * x, const float * y, float * dst, const int ne,
-    const int ne10, const int ne11, const int ne12,
-    const int nb1, const int nb2, int offset, const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= ne) {
-        return;
-    }
-    int src1_idx = i - offset;
-    int oz = src1_idx / nb2;
-    int oy = (src1_idx - (oz * nb2)) / nb1;
-    int ox = src1_idx % nb1;
-    if (src1_idx >= 0 && ox < ne10 && oy < ne11 && oz < ne12) {
-        dst[i] = x[i] + y[ox + oy * ne10 + oz * ne10 * ne11];
-    } else {
-        dst[i] = x[i];
-    }
-}
-
-static void gelu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const float GELU_COEF_A    = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    float xi = x[i];
-    dst[i] = 0.5f * xi *
-             (1.0f +
-              sycl::tanh(SQRT_2_OVER_PI * xi * (1.0f + GELU_COEF_A * xi * xi)));
-}
-
-static void silu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] / (1.0f + sycl::native::exp(-x[i]));
-}
-
-static void gelu_quick_f32(const float *x, float *dst, int k,
-                           const sycl::nd_item<3> &item_ct1) {
-    const float GELU_QUICK_COEF = -1.702f;
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * (1.0f / (1.0f + sycl::native::exp(GELU_QUICK_COEF * x[i])));
-}
-
-static void tanh_f32(const float *x, float *dst, int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::tanh((float)(x[i]));
-}
-
-static void relu_f32(const float * x, float * dst, const int k,
-                     const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::fmax((float)(x[i]), (float)0);
-}
-
-static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope,
-                           const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-    if (i >= k) {
-        return;
-    }
-    dst[i] = sycl::fmax((float)(x[i]), (float)0) +
-             sycl::fmin((float)(x[i]), 0.0f) * negative_slope;
-}
-
-static void sqr_f32(const float * x, float * dst, const int k,
-                    const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-    dst[i] = x[i] * x[i];
-}
-
-template <int block_size>
-static void norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                     const sycl::nd_item<3> &item_ct1, sycl::float2 *s_sum) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-
-    sycl::float2 mean_var = sycl::float2(0.f, 0.f);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        mean_var.x() += xi;
-        mean_var.y() += xi * xi;
-    }
-
-    // sum up partial sums
-    mean_var = warp_reduce_sum(mean_var, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = mean_var;
-        }
-        /*
-        DPCT1118:4: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        mean_var = s_sum[lane_id];
-        mean_var = warp_reduce_sum(mean_var, item_ct1);
-    }
-
-    const float mean = mean_var.x() / ncols;
-    const float var = mean_var.y() / ncols - mean * mean;
-    const float inv_std = sycl::rsqrt(var + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = (x[row*ncols + col] - mean) * inv_std;
-    }
-}
-
-static void concat_f32(const float  *x,const float  *y, float *dst, const int ne0, const int ne02,
-                       const sycl::nd_item<3> &item_ct1) {
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (item_ct1.get_group(0) < ne02) { // src0
-        int offset_src =
-            nidx + item_ct1.get_group(1) * ne0 +
-            item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-            dst[offset_dst] = x[offset_src];
-    } else {
-        int offset_src =
-            nidx + item_ct1.get_group(1) * ne0 +
-            (item_ct1.get_group(0) - ne02) * ne0 * item_ct1.get_group_range(1);
-            dst[offset_dst] = y[offset_src];
-    }
-}
-
-static void upscale_f32(const float  *x, float *dst, const int ne00, const int nb02, const int scale_factor,
-                        const sycl::nd_item<3> &item_ct1) {
-    int ne0 = ne00 * scale_factor;
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-    // operation
-    int i00 = nidx / scale_factor;
-    int i01 = item_ct1.get_group(1) / scale_factor;
-    int offset_src = i00 + i01 * ne00 + item_ct1.get_group(0) * nb02;
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    dst[offset_dst] = x[offset_src];
-}
-
-static void pad_f32(const float  *x, float *dst, const int ne0, const int ne00, const int ne01, const int ne02,
-                    const sycl::nd_item<3> &item_ct1) {
-    int nidx = item_ct1.get_local_id(2) +
-               item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (nidx >= ne0) {
-        return;
-    }
-
-    // operation
-    int offset_dst = nidx + item_ct1.get_group(1) * ne0 +
-                     item_ct1.get_group(0) * ne0 * item_ct1.get_group_range(1);
-    if (nidx < ne00 && item_ct1.get_group(1) < ne01 &&
-        item_ct1.get_group(0) < ne02) {
-        int offset_src = nidx + item_ct1.get_group(1) * ne00 +
-                         item_ct1.get_group(0) * ne00 * ne01;
-            dst[offset_dst] = x[offset_src];
-    } else {
-        dst[offset_dst] = 0.0f;
-    }
-}
-
-template <int block_size>
-static void group_norm_f32(const float * x, float * dst, const int group_size, const int ne_elements, const float eps,
-                           const sycl::nd_item<3> &item_ct1, float *s_sum) {
-    int start = item_ct1.get_group(2) * group_size;
-    int end = start + group_size;
-
-    start += item_ct1.get_local_id(2);
-
-    if (end >= ne_elements) {
-        end = ne_elements;
-    }
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int j = start; j < end; j += block_size) {
-        tmp += x[j];
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:5: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:67: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float mean = tmp / group_size;
-    tmp = 0.0f;
-
-    for (int j = start; j < end; j += block_size) {
-        float xi = x[j] - mean;
-        dst[j] = xi;
-        tmp += xi * xi;
-    }
-
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:6: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:68: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    float variance = tmp / group_size;
-    float scale = sycl::rsqrt(variance + eps);
-    for (int j = start; j < end; j += block_size) {
-        dst[j] *= scale;
-    }
-}
-
-template <int block_size>
-static void rms_norm_f32(const float * x, float * dst, const int ncols, const float eps,
-                         const sycl::nd_item<3> &item_ct1, float *s_sum) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    const int tid = item_ct1.get_local_id(2);
-
-    float tmp = 0.0f; // partial sum for thread in warp
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const float xi = x[row*ncols + col];
-        tmp += xi * xi;
-    }
-
-    // sum up partial sums
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-
-        int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-        int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-        if (lane_id == 0) {
-            s_sum[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:7: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        item_ct1.barrier(sycl::access::fence_space::local_space);
-        tmp = s_sum[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    const float mean = tmp / ncols;
-    const float scale = sycl::rsqrt(mean + eps);
-
-    for (int col = tid; col < ncols; col += block_size) {
-        dst[row*ncols + col] = scale * x[row*ncols + col];
-    }
-}
-
-static __dpct_inline__ void dequantize_q4_0(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q4_0 * x = (const block_q4_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x() = vui & 0xF;
-    v.y() = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {8.0f, 8.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x() = (v.x() - 8.0f) * d;
-    v.y() = (v.y() - 8.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __dpct_inline__ void dequantize_q4_1(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q4_1 * x = (const block_q4_1 *) vx;
-
-    const dfloat d = x[ib].dm[1];
-    const dfloat m = x[ib].dm[0];
-
-    const int vui = x[ib].qs[iqs];
-
-    v.x() = vui & 0xF;
-    v.y() = vui >> 4;
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x() = (v.x() * d) + m;
-    v.y() = (v.y() * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __dpct_inline__ void dequantize_q5_0(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q5_0 * x = (const block_q5_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hsub2(v, {16.0f, 16.0f});
-    v = __hmul2(v, {d, d});
-#else
-    v.x() = (v.x() - 16.0f) * d;
-    v.y() = (v.y() - 16.0f) * d;
-#endif // GGML_CUDA_F16
-}
-
-static __dpct_inline__ void dequantize_q5_1(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q5_1 * x = (const block_q5_1 *) vx;
-
-    const dfloat d = x[ib].dm[1];
-    const dfloat m = x[ib].dm[0];
-
-    uint32_t qh;
-    memcpy(&qh, x[ib].qh, sizeof(qh));
-
-    const int xh_0 = ((qh >> (iqs +  0)) << 4) & 0x10;
-    const int xh_1 = ((qh >> (iqs + 12))     ) & 0x10;
-
-    v.x() = ((x[ib].qs[iqs] & 0xf) | xh_0);
-    v.y() = ((x[ib].qs[iqs] >> 4) | xh_1);
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-    v = __hadd2(v, {m, m});
-#else
-    v.x() = (v.x() * d) + m;
-    v.y() = (v.y() * d) + m;
-#endif // GGML_CUDA_F16
-}
-
-static __dpct_inline__ void dequantize_q8_0(const void *vx, const int ib,
-                                            const int iqs, dfloat2 &v) {
-    const block_q8_0 * x = (const block_q8_0 *) vx;
-
-    const dfloat d = x[ib].d;
-
-    v.x() = x[ib].qs[iqs + 0];
-    v.y() = x[ib].qs[iqs + 1];
-
-#ifdef GGML_CUDA_F16
-    v = __hmul2(v, {d, d});
-#else
-    v.x() *= d;
-    v.y() *= d;
-#endif // GGML_CUDA_F16
-}
-
-//================================== k-quants
-
-template<typename dst_t>
-static void dequantize_block_q2_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-
-    const int i = item_ct1.get_group(2);
-    const block_q2_K * x = (const block_q2_K *) vx;
-
-    const int tid = item_ct1.get_local_id(2);
-#if QK_K == 256
-    const int n   = tid/32;
-    const int l   = tid - 32*n;
-    const int is  = 8*n + l/16;
-
-    const uint8_t q = x[i].qs[32*n + l];
-    dst_t * y = yy + i*QK_K + 128*n;
-
-    float dall = x[i].dm[1];
-    float dmin = x[i].dm[0];
-    y[l+ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[l+32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 2) & 3) - dmin * (x[i].scales[is+2] >> 4);
-    y[l+64] = dall * (x[i].scales[is+4] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+4] >> 4);
-    y[l+96] = dall * (x[i].scales[is+6] & 0xF) * ((q >> 6) & 3) - dmin * (x[i].scales[is+6] >> 4);
-#else
-    const int is = tid/16;  // 0 or 1
-    const int il = tid%16;  // 0...15
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    dst_t * y = yy + i*QK_K + 16*is + il;
-    float dall = __low2half(x[i].dm);
-    float dmin = __high2half(x[i].dm);
-    y[ 0] = dall * (x[i].scales[is+0] & 0xF) * ((q >> 0) & 3) - dmin * (x[i].scales[is+0] >> 4);
-    y[32] = dall * (x[i].scales[is+2] & 0xF) * ((q >> 4) & 3) - dmin * (x[i].scales[is+2] >> 4);
-#endif
-
-}
-
-template<typename dst_t>
-static void dequantize_block_q3_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-
-    const int i = item_ct1.get_group(2);
-    const block_q3_K * x = (const block_q3_K *) vx;
-
-#if QK_K == 256
-    const int r = item_ct1.get_local_id(2) / 4;
-    const int tid = r/2;
-    const int is0 = r%2;
-    const int l0 = 16 * is0 + 4 * (item_ct1.get_local_id(2) % 4);
-    const int n = tid / 4;
-    const int j = tid - 4*n;
-
-    uint8_t m = 1 << (4*n + j);
-    int is = 8*n + 2*j + is0;
-    int shift = 2*j;
-
-    int8_t us = is <  4 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+8] >> 0) & 3) << 4) :
-                is <  8 ? (x[i].scales[is-0] & 0xF) | (((x[i].scales[is+4] >> 2) & 3) << 4) :
-                is < 12 ? (x[i].scales[is-8] >>  4) | (((x[i].scales[is+0] >> 4) & 3) << 4) :
-                          (x[i].scales[is-8] >>  4) | (((x[i].scales[is-4] >> 6) & 3) << 4);
-    float d_all = x[i].d;
-    float dl = d_all * (us - 32);
-
-    dst_t * y = yy + i*QK_K + 128*n + 32*j;
-    const uint8_t * q = x[i].qs + 32*n;
-    const uint8_t * hm = x[i].hmask;
-
-    for (int l = l0; l < l0+4; ++l) y[l] = dl * ((int8_t)((q[l] >> shift) & 3) - ((hm[l] & m) ? 0 : 4));
-#else
-    const int tid = threadIdx.x;
-    const int is  = tid/16;  // 0 or 1
-    const int il  = tid%16;  // 0...15
-    const int im  = il/8;    // 0...1
-    const int in  = il%8;    // 0...7
-
-    dst_t * y = yy + i*QK_K + 16*is + il;
-
-    const uint8_t q = x[i].qs[il] >> (2*is);
-    const uint8_t h = x[i].hmask[in] >> (2*is + im);
-    const float   d = (float)x[i].d;
-
-    if (is == 0) {
-        y[ 0] = d * ((x[i].scales[0] & 0xF) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] & 0xF) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    } else {
-        y[ 0] = d * ((x[i].scales[0] >>  4) - 8) * ((int8_t)((q >> 0) & 3) - ((h >> 0) & 1 ? 0 : 4));
-        y[32] = d * ((x[i].scales[1] >>  4) - 8) * ((int8_t)((q >> 4) & 3) - ((h >> 4) & 1 ? 0 : 4));
-    }
-#endif
-
-}
-
-#if QK_K == 256
-static inline void get_scale_min_k4(int j, const uint8_t * q, uint8_t & d, uint8_t & m) {
-    if (j < 4) {
-        d = q[j] & 63; m = q[j + 4] & 63;
-    } else {
-        d = (q[j+4] & 0xF) | ((q[j-4] >> 6) << 4);
-        m = (q[j+4] >>  4) | ((q[j-0] >> 6) << 4);
-    }
-}
-#endif
-
-template<typename dst_t>
-static void dequantize_block_q4_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-    const block_q4_K * x = (const block_q4_K *) vx;
-
-    const int i = item_ct1.get_group(2);
-
-#if QK_K == 256
-    // assume 32 threads
-    const int tid = item_ct1.get_local_id(2);
-    const int il  = tid/8;
-    const int ir  = tid%8;
-    const int is  = 2*il;
-    const int n   = 4;
-
-    dst_t * y = yy + i*QK_K + 64*il + n*ir;
-
-    const float dall = x[i].dm[1];
-    const float dmin = x[i].dm[0];
-
-    const uint8_t * q = x[i].qs + 32*il + n*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-    for (int l = 0; l < n; ++l) {
-        y[l + 0] = d1 * (q[l] & 0xF) - m1;
-        y[l +32] = d2 * (q[l] >>  4) - m2;
-    }
-#else
-    const int tid = threadIdx.x;
-    const uint8_t * q = x[i].qs;
-    dst_t * y = yy + i*QK_K;
-    const float d = (float)x[i].dm[0];
-    const float m = (float)x[i].dm[1];
-    y[tid+ 0] = d * (x[i].scales[0] & 0xF) * (q[tid] & 0xF) - m * (x[i].scales[0] >> 4);
-    y[tid+32] = d * (x[i].scales[1] & 0xF) * (q[tid] >>  4) - m * (x[i].scales[1] >> 4);
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_block_q5_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-    const block_q5_K * x = (const block_q5_K *) vx;
-
-    const int i = item_ct1.get_group(2);
-
-#if QK_K == 256
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = item_ct1.get_local_id(2);
-    const int il  = tid/16;   // il is in 0...3
-    const int ir  = tid%16;   // ir is in 0...15
-    const int is  = 2*il;     // is is in 0...6
-
-    dst_t * y = yy + i*QK_K + 64*il + 2*ir;
-
-    const float dall = x[i].dm[1];
-    const float dmin = x[i].dm[0];
-
-    const uint8_t * ql = x[i].qs + 32*il + 2*ir;
-    const uint8_t * qh = x[i].qh + 2*ir;
-
-    uint8_t sc, m;
-    get_scale_min_k4(is + 0, x[i].scales, sc, m);
-    const float d1 = dall * sc; const float m1 = dmin * m;
-    get_scale_min_k4(is + 1, x[i].scales, sc, m);
-    const float d2 = dall * sc; const float m2 = dmin * m;
-
-    uint8_t   hm  = 1 << (2*il);
-    y[ 0] = d1 * ((ql[ 0] & 0xF) + (qh[ 0] & hm ? 16 : 0)) - m1;
-    y[ 1] = d1 * ((ql[ 1] & 0xF) + (qh[ 1] & hm ? 16 : 0)) - m1;
-    hm <<= 1;
-    y[32] = d2 * ((ql[ 0] >>  4) + (qh[ 0] & hm ? 16 : 0)) - m2;
-    y[33] = d2 * ((ql[ 1] >>  4) + (qh[ 1] & hm ? 16 : 0)) - m2;
-#else
-    const int tid = threadIdx.x;
-    const uint8_t q = x[i].qs[tid];
-    const int im = tid/8;  // 0...3
-    const int in = tid%8;  // 0...7
-    const int is = tid/16; // 0 or 1
-    const uint8_t h = x[i].qh[in] >> im;
-    const float d = x[i].d;
-    dst_t * y = yy + i*QK_K + tid;
-    y[ 0] = d * x[i].scales[is+0] * ((q & 0xF) - ((h >> 0) & 1 ? 0 : 16));
-    y[32] = d * x[i].scales[is+2] * ((q >>  4) - ((h >> 4) & 1 ? 0 : 16));
-#endif
-}
-
-template<typename dst_t>
-static void dequantize_block_q6_K(const void * __restrict__ vx, dst_t * __restrict__ yy,
-                                  const sycl::nd_item<3> &item_ct1) {
-    const block_q6_K * x = (const block_q6_K *) vx;
-
-    const int i = item_ct1.get_group(2);
-#if QK_K == 256
-
-    // assume 64 threads - this is very slightly better than the one below
-    const int tid = item_ct1.get_local_id(2);
-    const int ip  = tid/32;   // ip is 0 or 1
-    const int il  = tid - 32*ip; // 0...32
-    const int is  = 8*ip + il/16;
-
-    dst_t * y = yy + i*QK_K + 128*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t * ql = x[i].ql + 64*ip + il;
-    const uint8_t   qh = x[i].qh[32*ip + il];
-    const int8_t  * sc = x[i].scales + is;
-
-    y[ 0] = d * sc[0] * ((int8_t)((ql[ 0] & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[2] * ((int8_t)((ql[32] & 0xF) | (((qh >> 2) & 3) << 4)) - 32);
-    y[64] = d * sc[4] * ((int8_t)((ql[ 0]  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-    y[96] = d * sc[6] * ((int8_t)((ql[32]  >> 4) | (((qh >> 6) & 3) << 4)) - 32);
-#else
-
-    // assume 32 threads
-    const int tid = threadIdx.x;
-    const int ip  = tid/16;         // 0 or 1
-    const int il  = tid - 16*ip;    // 0...15
-
-    dst_t * y = yy + i*QK_K + 16*ip + il;
-
-    const float d = x[i].d;
-
-    const uint8_t   ql = x[i].ql[16*ip + il];
-    const uint8_t   qh = x[i].qh[il] >> (2*ip);
-    const int8_t  * sc = x[i].scales;
-
-    y[ 0] = d * sc[ip+0] * ((int8_t)((ql & 0xF) | (((qh >> 0) & 3) << 4)) - 32);
-    y[32] = d * sc[ip+2] * ((int8_t)((ql  >> 4) | (((qh >> 4) & 3) << 4)) - 32);
-#endif
-}
-
-/*
-DPCT1110:8: The total declared local variable size in device function
-dequantize_mul_mat_vec_q2_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q2_K * x = (const block_q2_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...15
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15 or 0...14 in steps of 2
-    const int q_offset = 32*im + l0;
-    const int s_offset = 8*im;
-    const int y_offset = 128*im + l0;
-
-    uint32_t aux[4];
-    const uint8_t * d = (const uint8_t *)aux;
-    const uint8_t * m = (const uint8_t *)(aux + 2);
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-
-        const float dall = x[i].dm[1];
-        const float dmin = x[i].dm[0];
-
-        const uint32_t * a = (const uint32_t *)(x[i].scales + s_offset);
-        aux[0] = a[0] & 0x0f0f0f0f;
-        aux[1] = a[1] & 0x0f0f0f0f;
-        aux[2] = (a[0] >> 4) & 0x0f0f0f0f;
-        aux[3] = (a[1] >> 4) & 0x0f0f0f0f;
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            sum1 += y[l+ 0] * d[0] * ((q[l+ 0] >> 0) & 3)
-                  + y[l+32] * d[2] * ((q[l+ 0] >> 2) & 3)
-                  + y[l+64] * d[4] * ((q[l+ 0] >> 4) & 3)
-                  + y[l+96] * d[6] * ((q[l+ 0] >> 6) & 3)
-                  + y[l+16] * d[1] * ((q[l+16] >> 0) & 3)
-                  + y[l+48] * d[3] * ((q[l+16] >> 2) & 3)
-                  + y[l+80] * d[5] * ((q[l+16] >> 4) & 3)
-                  +y[l+112] * d[7] * ((q[l+16] >> 6) & 3);
-            sum2 += y[l+ 0] * m[0] + y[l+32] * m[2] + y[l+64] * m[4] + y[ l+96] * m[6]
-                  + y[l+16] * m[1] + y[l+48] * m[3] + y[l+80] * m[5] + y[l+112] * m[7];
-
-        }
-        tmp += dall * sum1 - dmin * sum2;
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;
-
-    uint32_t uaux[2];
-    const uint8_t * d = (const uint8_t *)uaux;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint32_t * s = (const uint32_t *)x[i].scales;
-
-        uaux[0] = s[0] & 0x0f0f0f0f;
-        uaux[1] = (s[0] >> 4) & 0x0f0f0f0f;
-
-        const float2 dall = __half22float2(x[i].dm);
-
-        float sum1 = 0, sum2 = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t ql = q[l];
-            sum1 += y[l+ 0] * d[0] * ((ql >> 0) & 3)
-                  + y[l+16] * d[1] * ((ql >> 2) & 3)
-                  + y[l+32] * d[2] * ((ql >> 4) & 3)
-                  + y[l+48] * d[3] * ((ql >> 6) & 3);
-            sum2 += y[l+0] * d[4] + y[l+16] * d[5] + y[l+32] * d[6] + y[l+48] * d[7];
-        }
-        tmp += dall.x * sum1 - dall.y * sum2;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:9: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-/*
-DPCT1110:10: The total declared local variable size in device function
-dequantize_mul_mat_vec_q3_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q3_K * x = (const block_q3_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-
-    const uint16_t kmask1 = 0x0303;
-    const uint16_t kmask2 = 0x0f0f;
-
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
-
-    const int n  = K_QUANTS_PER_ITERATION;               // iterations in the inner loop
-    const int step = 16/K_QUANTS_PER_ITERATION;
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0....15 or 0...7
-
-    const uint8_t m = 1 << (4*im);
-
-    const int l0 = n*in;                                 // 0...15 or 0...14 in steps of 2
-    const int q_offset =  32*im + l0;
-    const int y_offset = 128*im + l0;
-
-    uint16_t utmp[4];
-    const int8_t * s = (const int8_t *)utmp;
-
-    const uint16_t s_shift = 4*im;
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * q = x[i].qs + q_offset;
-        const uint8_t * h = x[i].hmask + l0;
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        utmp[0] = ((a[0] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[1] = ((a[1] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 0)) & kmask1) << 4);
-        utmp[2] = ((a[2] >> s_shift) & kmask2) | (((a[4] >> (s_shift + 2)) & kmask1) << 4);
-        utmp[3] = ((a[3] >> s_shift) & kmask2) | (((a[5] >> (s_shift + 2)) & kmask1) << 4);
-
-        const float d = x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < n; ++l) {
-            sum += y[l+ 0] * (s[0] - 32) * (((q[l] >> 0) & 3) - (h[l] & (m << 0) ? 0 : 4))
-                 + y[l+32] * (s[2] - 32) * (((q[l] >> 2) & 3) - (h[l] & (m << 1) ? 0 : 4))
-                 + y[l+64] * (s[4] - 32) * (((q[l] >> 4) & 3) - (h[l] & (m << 2) ? 0 : 4))
-                 + y[l+96] * (s[6] - 32) * (((q[l] >> 6) & 3) - (h[l] & (m << 3) ? 0 : 4));
-            sum += y[l+16] * (s[1] - 32) * (((q[l+16] >> 0) & 3) - (h[l+16] & (m << 0) ? 0 : 4))
-                 + y[l+48] * (s[3] - 32) * (((q[l+16] >> 2) & 3) - (h[l+16] & (m << 1) ? 0 : 4))
-                 + y[l+80] * (s[5] - 32) * (((q[l+16] >> 4) & 3) - (h[l+16] & (m << 2) ? 0 : 4))
-                + y[l+112] * (s[7] - 32) * (((q[l+16] >> 6) & 3) - (h[l+16] & (m << 3) ? 0 : 4));
-        }
-        tmp += d * sum;
-
-    }
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15 or 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0....1 or 0...3
-    const int offset = tid * K_QUANTS_PER_ITERATION;         // 0...15 or 0...14
-    const int in = offset/8;                                 // 0 or 1
-    const int im = offset%8;                                 // 0...7
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y = yy + i * QK_K + offset;
-        const uint8_t * q = x[i].qs + offset;
-        const uint8_t * s = x[i].scales;
-
-        const float dall = (float)x[i].d;
-
-        float sum = 0;
-        for (int l = 0; l < K_QUANTS_PER_ITERATION; ++l) {
-            const uint8_t hl = x[i].hmask[im+l] >> in;
-            const uint8_t ql = q[l];
-            sum += y[l+ 0] * dall * ((s[0] & 0xF) - 8) * ((int8_t)((ql >> 0) & 3) - ((hl >> 0) & 1 ? 0 : 4))
-                 + y[l+16] * dall * ((s[0] >>  4) - 8) * ((int8_t)((ql >> 2) & 3) - ((hl >> 2) & 1 ? 0 : 4))
-                 + y[l+32] * dall * ((s[1] & 0xF) - 8) * ((int8_t)((ql >> 4) & 3) - ((hl >> 4) & 1 ? 0 : 4))
-                 + y[l+48] * dall * ((s[1] >>  4) - 8) * ((int8_t)((ql >> 6) & 3) - ((hl >> 6) & 1 ? 0 : 4));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:11: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-/*
-DPCT1110:12: The total declared local variable size in device function
-dequantize_mul_mat_vec_q4_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q4_K * x = (const block_q4_K *)vx + ib0;
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0,1
-
-    const int step = 8/K_QUANTS_PER_ITERATION;           // 8 or 4
-
-    const int il  = tid/step;                            // 0...3
-    const int ir  = tid - step*il;                       // 0...7 or 0...3
-    const int n   = 2 * K_QUANTS_PER_ITERATION;          // 2 or 4
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-#if K_QUANTS_PER_ITERATION == 2
-    uint32_t q32[4];
-    const uint8_t * q4 = (const uint8_t *)q32;
-#else
-    uint16_t q16[4];
-    const uint8_t * q4 = (const uint8_t *)q16;
-#endif
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y1 = yy + i*QK_K + y_offset;
-        const float   * y2 = y1 + 128;
-
-        const float dall = x[i].dm[1];
-        const float dmin = x[i].dm[0];
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-#if K_QUANTS_PER_ITERATION == 2
-        const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset);
-        const uint32_t * q2 = q1 + 16;
-
-        q32[0] = q1[0] & 0x0f0f0f0f;
-        q32[1] = q1[0] & 0xf0f0f0f0;
-        q32[2] = q2[0] & 0x0f0f0f0f;
-        q32[3] = q2[0] & 0xf0f0f0f0;
-
-        sycl::float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 4; ++l) {
-            s.x() += y1[l] * q4[l + 0]; s.y() += y1[l + 32] * q4[l + 4];
-            s.z() += y2[l] * q4[l + 8]; s.w() += y2[l + 32] * q4[l + 12];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x() * sc[0] + s.y() * sc[1] * 1.f / 16.f +
-                       s.z() * sc[4] + s.w() * sc[5] * 1.f / 16.f) -
-               dmin * smin;
-#else
-        const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset);
-        const uint16_t * q2 = q1 + 32;
-
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[0] & 0xf0f0;
-        q16[2] = q2[0] & 0x0f0f;
-        q16[3] = q2[0] & 0xf0f0;
-
-        float4 s = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        for (int l = 0; l < 2; ++l) {
-            s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2];
-            s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6];
-            smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7];
-        }
-        tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin;
-#endif
-
-    }
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    float tmp = 0;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const float   * y = yy + i*QK_K + step;
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux16[0] = a[0] & 0x0f0f;
-        aux16[1] = (a[0] >> 4) & 0x0f0f;
-        const float d = (float)x[i].dm[0];
-        const float m = (float)x[i].dm[1];
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * (d * s[0] * (q[j+ 0] & 0xF) - m * s[2])
-                 + y[j+16] * (d * s[0] * (q[j+16] & 0xF) - m * s[2])
-                 + y[j+32] * (d * s[1] * (q[j+ 0] >>  4) - m * s[3])
-                 + y[j+48] * (d * s[1] * (q[j+16] >>  4) - m * s[3]);
-        }
-        tmp += sum;
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:13: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-/*
-DPCT1110:14: The total declared local variable size in device function
-dequantize_mul_mat_vec_q5_k exceeds 128 bytes and may cause high register
-pressure. Consult with your hardware vendor to find the total register size
-available and adjust the code, or use smaller sub-group size to avoid high
-register pressure.
-*/
-static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
-                                        const float *__restrict__ yy,
-                                        float *__restrict__ dst,
-                                        const int ncols,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    const int row = item_ct1.get_group(2);
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q5_K * x = (const block_q5_K *)vx + ib0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-#if QK_K == 256
-    const uint16_t kmask1 = 0x3f3f;
-    const uint16_t kmask2 = 0x0f0f;
-    const uint16_t kmask3 = 0xc0c0;
-
-    const int tid = item_ct1.get_local_id(2) / 2; // 0...15
-    const int ix = item_ct1.get_local_id(2) % 2;
-
-    const int il  = tid/4;     // 0...3
-    const int ir  = tid - 4*il;// 0...3
-    const int n   = 2;
-
-    const int im = il/2;  // 0 or 1. 0 computes 0,32 + 128,160, 1 computes 64,96 + 192,224
-    const int in = il%2;
-
-    const int l0 = n*(2*ir + in);
-    const int q_offset = 32*im + l0;
-    const int y_offset = 64*im + l0;
-
-    const uint8_t hm1  = 1 << (2*im);
-    const uint8_t hm2  = hm1 << 4;
-
-    uint16_t aux[4];
-    const uint8_t * sc = (const uint8_t *)aux;
-
-    uint16_t q16[8];
-    const uint8_t * q4 = (const uint8_t *)q16;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2) {
-
-        const uint8_t * ql1 = x[i].qs + q_offset;
-        const uint8_t * qh  = x[i].qh + l0;
-        const float   * y1  = yy + i*QK_K + y_offset;
-        const float   * y2  = y1 + 128;
-
-        const float dall = x[i].dm[1];
-        const float dmin = x[i].dm[0];
-
-        const uint16_t * a = (const uint16_t *)x[i].scales;
-        aux[0] = a[im+0] & kmask1;
-        aux[1] = a[im+2] & kmask1;
-        aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2);
-        aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2);
-
-        sycl::float4 sum = {0.f, 0.f, 0.f, 0.f};
-        float smin = 0;
-        const uint16_t * q1 = (const uint16_t *)ql1;
-        const uint16_t * q2 = q1 + 32;
-        q16[0] = q1[0] & 0x0f0f;
-        q16[1] = q1[8] & 0x0f0f;
-        q16[2] = (q1[0] >> 4) & 0x0f0f;
-        q16[3] = (q1[8] >> 4) & 0x0f0f;
-        q16[4] = q2[0] & 0x0f0f;
-        q16[5] = q2[8] & 0x0f0f;
-        q16[6] = (q2[0] >> 4) & 0x0f0f;
-        q16[7] = (q2[8] >> 4) & 0x0f0f;
-        for (int l = 0; l < n; ++l) {
-            sum.x() +=
-                y1[l + 0] * (q4[l + 0] + (qh[l + 0] & (hm1 << 0) ? 16 : 0)) +
-                y1[l + 16] * (q4[l + 2] + (qh[l + 16] & (hm1 << 0) ? 16 : 0));
-            sum.y() +=
-                y1[l + 32] * (q4[l + 4] + (qh[l + 0] & (hm1 << 1) ? 16 : 0)) +
-                y1[l + 48] * (q4[l + 6] + (qh[l + 16] & (hm1 << 1) ? 16 : 0));
-            sum.z() +=
-                y2[l + 0] * (q4[l + 8] + (qh[l + 0] & (hm2 << 0) ? 16 : 0)) +
-                y2[l + 16] * (q4[l + 10] + (qh[l + 16] & (hm2 << 0) ? 16 : 0));
-            sum.w() +=
-                y2[l + 32] * (q4[l + 12] + (qh[l + 0] & (hm2 << 1) ? 16 : 0)) +
-                y2[l + 48] * (q4[l + 14] + (qh[l + 16] & (hm2 << 1) ? 16 : 0));
-            smin += (y1[l] + y1[l+16]) * sc[2] + (y1[l+32] + y1[l+48]) * sc[3]
-                  + (y2[l] + y2[l+16]) * sc[6] + (y2[l+32] + y2[l+48]) * sc[7];
-        }
-        tmp += dall * (sum.x() * sc[0] + sum.y() * sc[1] + sum.z() * sc[4] +
-                       sum.w() * sc[5]) -
-               dmin * smin;
-    }
-
-#else
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...15
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);
-    const int step = tid * K_QUANTS_PER_ITERATION;
-    const int im = step/8;
-    const int in = step%8;
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-        const uint8_t * q = x[i].qs + step;
-        const int8_t  * s = x[i].scales;
-        const float   * y = yy + i*QK_K + step;
-        const float     d = x[i].d;
-        float sum = 0.f;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            const uint8_t h = x[i].qh[in+j] >> im;
-            sum += y[j+ 0] * d * s[0] * ((q[j+ 0] & 0xF) - ((h >> 0) & 1 ? 0 : 16))
-                 + y[j+16] * d * s[1] * ((q[j+16] & 0xF) - ((h >> 2) & 1 ? 0 : 16))
-                 + y[j+32] * d * s[2] * ((q[j+ 0] >>  4) - ((h >> 4) & 1 ? 0 : 16))
-                 + y[j+48] * d * s[3] * ((q[j+16] >>  4) - ((h >> 6) & 1 ? 0 : 16));
-        }
-        tmp += sum;
-    }
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:15: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows,
-                                        const sycl::nd_item<3> &item_ct1) {
-
-    static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION");
-
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-    if (row > nrows) return;
-
-    const int num_blocks_per_row = ncols / QK_K;
-    const int ib0 = row*num_blocks_per_row;
-
-    const block_q6_K * x = (const block_q6_K *)vx + ib0;
-
-#if QK_K == 256
-
-    const int tid =
-        item_ct1.get_local_id(2) / K_QUANTS_PER_ITERATION; // 0...31 or 0...16
-    const int ix =
-        item_ct1.get_local_id(2) % K_QUANTS_PER_ITERATION; // 0 or 0, 1
-
-    const int step = 16/K_QUANTS_PER_ITERATION;          // 16 or 8
-
-    const int im = tid/step;                             // 0 or 1. 0 computes 0..., 1 computes 128...
-    const int in = tid - step*im;                        // 0...15 or 0...7
-
-#if K_QUANTS_PER_ITERATION == 1
-    const int l0 = K_QUANTS_PER_ITERATION*in;            // 0...15
-    const int is = 0;
-#else
-    const int l0 = 4 * in;                               // 0, 4, 8, ..., 28
-    const int is = in / 4;
-#endif
-    const int ql_offset = 64*im + l0;
-    const int qh_offset = 32*im + l0;
-    const int s_offset  =  8*im + is;
-    const int y_offset = 128*im + l0;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + y_offset;
-        const uint8_t * ql = x[i].ql + ql_offset;
-        const uint8_t * qh = x[i].qh + qh_offset;
-        const int8_t  * s  = x[i].scales + s_offset;
-
-        const float d = x[i].d;
-
-#if K_QUANTS_PER_ITERATION == 1
-        float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
-                  + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
-                  + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
-                  + y[48] * s[3] * d * ((int8_t)((ql[48] & 0xF) | ((qh[16] & 0x0c) << 2)) - 32)
-                  + y[64] * s[4] * d * ((int8_t)((ql[ 0]  >> 4) | ((qh[ 0] & 0x30) >> 0)) - 32)
-                  + y[80] * s[5] * d * ((int8_t)((ql[16]  >> 4) | ((qh[16] & 0x30) >> 0)) - 32)
-                  + y[96] * s[6] * d * ((int8_t)((ql[32]  >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
-                  +y[112] * s[7] * d * ((int8_t)((ql[48]  >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
-        tmp += sum;
-#else
-        float sum = 0;
-        for (int l = 0; l < 4; ++l) {
-            sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
-                 + y[l+32] * s[2] * d * ((int8_t)((ql[l+32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32)
-                 + y[l+64] * s[4] * d * ((int8_t)((ql[l+ 0]  >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32)
-                 + y[l+96] * s[6] * d * ((int8_t)((ql[l+32]  >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
-        }
-        tmp += sum;
-#endif
-
-    }
-
-#else
-
-    const int tid = threadIdx.x/(2*K_QUANTS_PER_ITERATION);  // 0...7
-    const int ix  = threadIdx.x%(2*K_QUANTS_PER_ITERATION);  // 0...3
-
-    const int step = tid * K_QUANTS_PER_ITERATION;
-
-    float tmp = 0; // partial sum for thread in warp
-
-    for (int i = ix; i < num_blocks_per_row; i += 2*K_QUANTS_PER_ITERATION) {
-
-        const float   * y  = yy + i * QK_K + step;
-        const uint8_t * ql = x[i].ql + step;
-        const uint8_t * qh = x[i].qh + step;
-        const int8_t  * s  = x[i].scales;
-
-        const float d = x[i+0].d;
-
-        float sum = 0;
-        for (int j = 0; j < K_QUANTS_PER_ITERATION; ++j) {
-            sum += y[j+ 0] * s[0] * d * ((int8_t)((ql[j+ 0] & 0xF) | ((qh[j] & 0x03) << 4)) - 32)
-                 + y[j+16] * s[1] * d * ((int8_t)((ql[j+16] & 0xF) | ((qh[j] & 0x0c) << 2)) - 32)
-                 + y[j+32] * s[2] * d * ((int8_t)((ql[j+ 0] >>  4) | ((qh[j] & 0x30) >> 0)) - 32)
-                 + y[j+48] * s[3] * d * ((int8_t)((ql[j+16] >>  4) | ((qh[j] & 0xc0) >> 2)) - 32);
-        }
-        tmp += sum;
-
-    }
-
-#endif
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:16: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (tid == 0) {
-        dst[row] = tmp;
-    }
-}
-
-static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const sycl::half *x = (const sycl::half *)vx;
-
-    // automatic half -> float type cast if dfloat == float
-    v.x() = x[ib + iqs + 0];
-    v.y() = x[ib + iqs + 1];
-}
-
-static void convert_f32(const void * vx, const int ib, const int iqs, dfloat2 & v){
-    const float * x = (const float *) vx;
-
-    // automatic half -> float type cast if dfloat == float
-    v.x() = x[ib + iqs + 0];
-    v.y() = x[ib + iqs + 1];
-}
-
-static void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int kx, const int kx_padded,
-                          const sycl::nd_item<3> &item_ct1) {
-    const int ix = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                   item_ct1.get_local_id(2);
-
-    if (ix >= kx_padded) {
-        return;
-    }
-
-    const int iy = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                   item_ct1.get_local_id(1);
-
-    const int i_padded = iy*kx_padded + ix;
-
-    block_q8_1 * y = (block_q8_1 *) vy;
-
-    const int ib = i_padded / QK8_1; // block index
-    const int iqs = i_padded % QK8_1; // quant index
-
-    const float xi = ix < kx ? x[iy*kx + ix] : 0.0f;
-    float amax = sycl::fabs((float)xi);
-    float sum = xi;
-
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:17: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        amax = sycl::fmax(amax, dpct::permute_sub_group_by_xor(
-                                    item_ct1.get_sub_group(), amax, mask));
-        /*
-        DPCT1023:18: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        sum +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), sum, mask);
-    }
-
-    const float d = amax / 127;
-    const int8_t q = amax == 0.0f ? 0 : sycl::round(xi / d);
-
-    y[ib].qs[iqs] = q;
-
-    if (iqs > 0) {
-        return;
-    }
-
-    reinterpret_cast<sycl::half &>(y[ib].ds.x()) = d;
-    reinterpret_cast<sycl::half &>(y[ib].ds.y()) = sum;
-}
-
-template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void k_get_rows(
-            const void * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12,
-            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
-
-    const int i00 = (item_ct1.get_group(2) * item_ct1.get_local_range(2) +
-                     item_ct1.get_local_id(2)) *
-                    2;
-    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) /
-                    ne12;
-    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) %
-                    ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const void * src0_row = (const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03;
-
-    const int ib = i00/qk; // block index
-    const int iqs = (i00%qk)/qr; // quant index
-    const int iybs = i00 - i00%qk; // dst block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(src0_row, ib, iqs, v);
-
-    dst_row[iybs + iqs + 0] = v.x();
-    dst_row[iybs + iqs + y_offset] = v.y();
-}
-
-template<typename src0_t, typename dst_t>
-static void k_get_rows_float(
-            const src0_t * src0, const int32_t * src1, dst_t * dst,
-            int64_t ne00, /*int64_t ne01, int64_t ne02, int64_t ne03,*/
-            /*int64_t ne10, int64_t ne11,*/ int64_t ne12, /*int64_t ne13,*/
-            /*size_t s0,*/ size_t s1, size_t s2, size_t s3,
-            /*size_t nb00,*/ size_t nb01, size_t nb02, size_t nb03,
-            size_t s10, size_t s11, size_t s12,
-            const sycl::nd_item<3> &item_ct1/*, size_t s13*/) {
-
-    const int i00 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
-                    item_ct1.get_local_id(2);
-    const int i10 = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i11 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) /
-                    ne12;
-    const int i12 = (item_ct1.get_group(0) * item_ct1.get_local_range(0) +
-                     item_ct1.get_local_id(0)) %
-                    ne12;
-
-    if (i00 >= ne00) {
-        return;
-    }
-
-    const int i01 = src1[i10*s10 + i11*s11 + i12*s12];
-
-    dst_t * dst_row = dst + i10*s1 + i11*s2 + i12*s3;
-    const src0_t * src0_row = (const src0_t *)((const char *)src0 + i01*nb01 + i11*nb02 + i12*nb03);
-
-    dst_row[i00] = src0_row[i00];
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k,
-                             const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  2 * item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    const int ib = i/qk; // block index
-    const int iqs = (i%qk)/qr; // quant index
-    const int iybs = i - i%qk; // y block start index
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-    // dequantize
-    dfloat2 v;
-    dequantize_kernel(vx, ib, iqs, v);
-
-    y[iybs + iqs + 0] = v.x();
-    y[iybs + iqs + y_offset] = v.y();
-}
-
-// VDR = vec dot ratio, how many contiguous integers each thread processes when the vec dot kernel is called
-// MMVQ = mul_mat_vec_q, MMQ = mul_mat_q
-
-#define VDR_Q4_0_Q8_1_MMVQ 2
-#define VDR_Q4_0_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
-                                                    const float &d4,
-                                                    const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // second part effectively subtracts 8 from each quant value
-    return d4 * (sumi * ds8f.x - (8*vdr/QI4_0) * ds8f.y);
-#else
-    /*
-    DPCT1007:69: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q4_1_Q8_1_MMVQ 2
-#define VDR_Q4_1_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
-                                                    const sycl::half2 &dm4,
-                                                    const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        const int vi0 = (v[i] >> 0) & 0x0F0F0F0F;
-        const int vi1 = (v[i] >> 4) & 0x0F0F0F0F;
-
-        // SIMD dot product of quantized values
-        sumi = __dp4a(vi0, u[2*i+0], sumi);
-        sumi = __dp4a(vi1, u[2*i+1], sumi);
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm4, ds8));
-    const float d4d8 = tmp.x;
-    const float m4s8 = tmp.y;
-#else
-    const float2 dm4f = __half22float2(dm4);
-    const float2 ds8f = __half22float2(ds8);
-    const float d4d8 = dm4f.x * ds8f.x;
-    const float m4s8 = dm4f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/(vdr * QR4_1) to compensate for multiple threads adding it
-    return sumi * d4d8 + m4s8 / (QI8_1 / (vdr * QR4_1));
-#else
-    /*
-    DPCT1007:70: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_0_Q8_1_MMVQ 2
-#define VDR_Q5_0_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float
-vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
-                       const float &d5, const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
-    }
-
-    const float2 ds8f = __half22float2(ds8);
-
-    // second part effectively subtracts 16 from each quant value
-    return d5 * (sumi * ds8f.x - (16*vdr/QI5_0) * ds8f.y);
-#else
-    /*
-    DPCT1007:71: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_1_Q8_1_MMVQ 2
-#define VDR_Q5_1_Q8_1_MMQ  4
-
-template <int vdr>
-static __dpct_inline__ float
-vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
-                       const sycl::half2 &dm5, const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        int vi0 = (vl[i] >>  0) & 0x0F0F0F0F; // lower 4 qs bits, still need qh as 5th bits
-        vi0    |= (vh[i] <<  4) & 0x00000010; // 0 ->  4
-        vi0    |= (vh[i] << 11) & 0x00001000; // 1 -> 12
-        vi0    |= (vh[i] << 18) & 0x00100000; // 2 -> 20
-        vi0    |= (vh[i] << 25) & 0x10000000; // 3 -> 28
-        sumi = __dp4a(vi0, u[2*i+0], sumi); // SIMD dot product of quantized values
-
-        int vi1 = (vl[i] >>  4) & 0x0F0F0F0F; // upper 4 qs bits, still need qh as 5th bits
-        vi1    |= (vh[i] >> 12) & 0x00000010; // 16 ->  4
-        vi1    |= (vh[i] >>  5) & 0x00001000; // 17 -> 12
-        vi1    |= (vh[i] <<  2) & 0x00100000; // 18 -> 20
-        vi1    |= (vh[i] <<  9) & 0x10000000; // 19 -> 28
-        sumi = __dp4a(vi1, u[2*i+1], sumi); // SIMD dot product of quantized values
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm5, ds8));
-    const float d5d8 = tmp.x;
-    const float m5s8 = tmp.y;
-#else
-    const float2 dm5f = __half22float2(dm5);
-    const float2 ds8f = __half22float2(ds8);
-    const float d5d8 = dm5f.x * ds8f.x;
-    const float m5s8 = dm5f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI5_1 / vdr to compensate for multiple threads adding it
-    return sumi*d5d8 + m5s8 / (QI5_1 / vdr);
-
-#else
-    /*
-    DPCT1007:72: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q8_0_Q8_1_MMVQ 2
-#define VDR_Q8_0_Q8_1_MMQ 8
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
-                                                    const float &d8_0,
-                                                    const float &d8_1) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
-    }
-
-    return d8_0*d8_1 * sumi;
-#else
-    /*
-    DPCT1007:73: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-template <int vdr>
-static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
-                                                    const sycl::half2 &dm8,
-                                                    const sycl::half2 &ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i = 0; i < vdr; ++i) {
-        // SIMD dot product of quantized values
-        sumi = __dp4a(v[i], u[i], sumi);
-    }
-
-#ifdef GGML_CUDA_F16
-    const float2 tmp = __half22float2(__hmul2(dm8, ds8));
-    const float d8d8 = tmp.x;
-    const float m8s8 = tmp.y;
-#else
-    const float2 dm8f = __half22float2(dm8);
-    const float2 ds8f = __half22float2(ds8);
-    const float d8d8 = dm8f.x * ds8f.x;
-    const float m8s8 = dm8f.y * ds8f.y;
-#endif // GGML_CUDA_F16
-
-    // scale second part of sum by QI8_1/ vdr to compensate for multiple threads adding it
-    return sumi*d8d8 + m8s8 / (QI8_1 / vdr);
-#else
-    /*
-    DPCT1007:74: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q2_K_Q8_1_MMVQ 1
-#define VDR_Q2_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
-    const int &v, const int *__restrict__ u, const uint8_t *__restrict__ scales,
-    const sycl::half2 &dm2, const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++i) {
-        const int sc = scales[2*i];
-
-        const int vi = (v >> (2*i)) & 0x03030303;
-
-        sumf_d += d8[i] * (__dp4a(vi, u[i], 0) * (sc & 0xF)); // SIMD dot product
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-        sumf_m += d8[i] * __dp4a(m, u[i], 0); // multiply constant q2_K part with sum of q8_1 values
-    }
-
-    const float2 dm2f = __half22float2(dm2);
-
-    return dm2f.x*sumf_d - dm2f.y*sumf_m;
-#else
-    /*
-    DPCT1007:75: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __dpct_inline__ float
-vec_dot_q2_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
-                           const uint8_t *__restrict__ scales,
-                           const sycl::half2 &dm2, const float &d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi_d = 0;
-    int sumi_m = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QI8_1; i0 += QI8_1/2) {
-        int sumi_d_sc = 0;
-
-        const int sc = scales[i0 / (QI8_1/2)];
-
-        // fill int with 4x m
-        int m = sc >> 4;
-        m |= m <<  8;
-        m |= m << 16;
-
-#pragma unroll
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_d_sc = __dp4a(v[i], u[i], sumi_d_sc); // SIMD dot product
-            sumi_m    = __dp4a(m,    u[i], sumi_m); // multiply sum of q8_1 values with m
-        }
-
-        sumi_d += sumi_d_sc * (sc & 0xF);
-    }
-
-    const float2 dm2f = __half22float2(dm2);
-
-    return d8 * (dm2f.x*sumi_d - dm2f.y*sumi_m);
-#else
-    /*
-    DPCT1007:76: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q3_K_Q8_1_MMVQ 1
-#define VDR_Q3_K_Q8_1_MMQ  2
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
-    const int &vl, const int &vh, const int *__restrict__ u,
-    const uint8_t *__restrict__ scales, const int &scale_offset,
-    const float &d3, const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        const int isc = scale_offset + 2*i;
-
-        const int isc_low = isc % (QK_K/32);
-        const int sc_shift_low = 4 * (isc / (QK_K/32));
-        const int sc_low  = (scales[isc_low] >> sc_shift_low) & 0xF;
-
-        const int isc_high = isc % (QK_K/64);
-        const int sc_shift_high = 2 * (isc / (QK_K/64));
-        const int sc_high = ((scales[(QK_K/32) + isc_high] >> sc_shift_high) & 3) << 4;
-
-        const int sc = (sc_low | sc_high) - 32;
-
-        const int vil = (vl >> (2*i)) & 0x03030303;
-
-        const int vih = ((vh >> i) << 2) & 0x04040404;
-
-        const int vi = __vsubss4(vil, vih);
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d3 * sumf;
-#else
-    /*
-    DPCT1007:77: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __dpct_inline__ float
-vec_dot_q3_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
-                           const int8_t *__restrict__ scales, const float &d3,
-                           const float &d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    int sumi = 0;
-
-#pragma unroll
-    for (int i0 = 0; i0 < QR3_K*VDR_Q3_K_Q8_1_MMQ; i0 += QI8_1/2) {
-        int sumi_sc = 0;
-
-        for (int i = i0; i < i0 + QI8_1/2; ++i) {
-            sumi_sc = __dp4a(v[i], u[i], sumi_sc); // SIMD dot product
-        }
-
-        sumi += sumi_sc * scales[i0 / (QI8_1/2)];
-    }
-
-    return d3*d8 * sumi;
-#else
-    /*
-    DPCT1007:78: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q4_K_Q8_1_MMVQ 2
-#define VDR_Q4_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
-    const int *__restrict__ v, const int *__restrict__ u,
-    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K; ++i) {
-        const int v0i = (v[0] >> (4*i)) & 0x0F0F0F0F;
-        const int v1i = (v[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int dot1 = __dp4a(v1i, u[2*i+1], __dp4a(v0i, u[2*i+0], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+1], __dp4a(0x01010101, u[2*i+0], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);  // multiply constant part of q4_K with sum of q8_1 values
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    /*
-    DPCT1007:79: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_mmq(
-    const int *__restrict__ v, const int *__restrict__ u,
-    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR4_K*VDR_Q4_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a((v[j] >> (4*i)) & 0x0F0F0F0F, u[i*QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const float2 ds8f = __half22float2(ds8[i]);
-
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    /*
-    DPCT1007:80: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q5_K_Q8_1_MMVQ 2
-#define VDR_Q5_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
-    const int *__restrict__ vl, const int *__restrict__ vh,
-    const int *__restrict__ u, const uint8_t *__restrict__ sc,
-    const uint8_t *__restrict__ m, const sycl::half2 &dm5,
-    const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const int vl0i = (vl[0] >> (4*i)) & 0x0F0F0F0F;
-        const int vl1i = (vl[1] >> (4*i)) & 0x0F0F0F0F;
-
-        const int vh0i = ((vh[0] >> i) << 4) & 0x10101010;
-        const int vh1i = ((vh[1] >> i) << 4) & 0x10101010;
-
-        const int v0i = vl0i | vh0i;
-        const int v1i = vl1i | vh1i;
-
-        const int dot1 = __dp4a(v0i, u[2*i+0], __dp4a(v1i, u[2*i+1], 0)); // SIMD dot product
-        const int dot2 = __dp4a(0x01010101, u[2*i+0], __dp4a(0x01010101, u[2*i+1], 0)); // sum of u
-
-        sumf_d += d8[i] * (dot1 * sc[i]);
-        sumf_m += d8[i] * (dot2 * m[i]);
-
-    }
-
-    const float2 dm5f = __half22float2(dm5);
-
-    return dm5f.x*sumf_d - dm5f.y*sumf_m;
-
-#else
-    /*
-    DPCT1007:81: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_mmq(
-    const int *__restrict__ v, const int *__restrict__ u,
-    const uint8_t *__restrict__ sc, const uint8_t *__restrict__ m,
-    const sycl::half2 &dm4, const sycl::half2 *__restrict__ ds8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K*VDR_Q5_K_Q8_1_MMQ/QI8_1; ++i) {
-        int sumi_d = 0;
-
-#pragma unroll
-        for (int j = 0; j < QI8_1; ++j) {
-            sumi_d = __dp4a(v[i*QI8_1 + j], u[i*QI8_1 + j], sumi_d); // SIMD dot product
-        }
-
-        const float2 ds8f = __half22float2(ds8[i]);
-
-        sumf_d += ds8f.x * (sc[i] * sumi_d);
-        sumf_m += ds8f.y *   m[i]; // sum of q8_1 block * q4_K min val
-    }
-
-    const float2 dm4f = __half22float2(dm4);
-
-    return dm4f.x*sumf_d - dm4f.y*sumf_m;
-
-#else
-    /*
-    DPCT1007:82: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-#define VDR_Q6_K_Q8_1_MMVQ 1
-#define VDR_Q6_K_Q8_1_MMQ  8
-
-// contiguous v/x values
-static __dpct_inline__ float
-vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
-                            const int *__restrict__ u,
-                            const int8_t *__restrict__ scales, const float &d,
-                            const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf = 0.0f;
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        const int sc = scales[4*i];
-
-        const int vil = (vl >> (4*i)) & 0x0F0F0F0F;
-
-        const int vih = ((vh >> (4*i)) << 4) & 0x30303030;
-
-        const int vi = __vsubss4((vil | vih), 0x20202020); // vi = (vil | vih) - 32
-
-        sumf += d8[i] * (__dp4a(vi, u[i], 0) * sc); // SIMD dot product
-    }
-
-    return d*sumf;
-#else
-    /*
-    DPCT1007:83: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-// contiguous u/y values
-static __dpct_inline__ float
-vec_dot_q6_K_q8_1_impl_mmq(const int *__restrict__ v, const int *__restrict__ u,
-                           const int8_t *__restrict__ sc, const float &d6,
-                           const float *__restrict__ d8) {
-
-#if DPCT_COMPATIBILITY_TEMP >=                                                 \
-    MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    float sumf_d = 0.0f;
-
-#pragma unroll
-    for (int i0 = 0; i0 < VDR_Q6_K_Q8_1_MMQ; i0 += 4) {
-        int2 sumi_d = {0, 0}; // 2 q6_K scales per q8_1 scale
-
-#pragma unroll
-        for (int i = i0; i < i0 + 2; ++i) {
-            sumi_d.x = __dp4a(v[2*i+0], u[2*i+0], sumi_d.x); // SIMD dot product
-            sumi_d.x = __dp4a(v[2*i+1], u[2*i+1], sumi_d.x); // SIMD dot product
-
-            sumi_d.y = __dp4a(v[2*i+4], u[2*i+4], sumi_d.y); // SIMD dot product
-            sumi_d.y = __dp4a(v[2*i+5], u[2*i+5], sumi_d.y); // SIMD dot product
-        }
-
-        sumf_d += d8[i0/4] * (sc[i0/2+0]*sumi_d.x + sc[i0/2+1]*sumi_d.y);
-    }
-
-    return d6 * sumf_d;
-
-#else
-    /*
-    DPCT1007:84: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-}
-
-static __dpct_inline__ float
-vec_dot_q4_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
-
-    int v[VDR_Q4_0_Q8_1_MMVQ];
-    int u[2*VDR_Q4_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_0_Q8_1_MMVQ; ++i) {
-        v[i]     = get_int_from_uint8(bq4_0->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_0);
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q4_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs, float *tile_x_d) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_qs;
-    *x_dm = (sycl::half2 *)tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q4_0(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_0;
-    const int kqsx = k % QI4_0;
-
-    const block_q4_0 * bx0 = (const block_q4_0 *) vx;
-
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_0) {
-        int i = i0 + i_offset * QI4_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q4_0_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const float * x_dmf = (const float *) x_dm;
-
-    int u[2*VDR_Q4_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_0) % WARP_SIZE];
-    }
-
-    return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dmf[i * (WARP_SIZE/QI4_0) + i/QI4_0 + k/QI4_0],
-         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q4_1_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
-
-    int v[VDR_Q4_1_Q8_1_MMVQ];
-    int u[2*VDR_Q4_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
-        v[i]    = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q4_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs, sycl::half2 *tile_x_dm) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_qs;
-    *x_dm = tile_x_dm;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q4_1(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_1;
-    const int kqsx = k % QI4_1;
-
-    const block_q4_1 * bx0 = (const block_q4_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_1) {
-        int i = i0 + i_offset * QI4_1 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q4_1_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-
-    int u[2*VDR_Q4_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q4_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI4_1) % WARP_SIZE];
-    }
-
-    return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], u, x_dm[i * (WARP_SIZE/QI4_1) + i/QI4_1 + k/QI4_1],
-         y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q5_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
-
-    int vl[VDR_Q5_0_Q8_1_MMVQ];
-    int vh[VDR_Q5_0_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_0_Q8_1_MMVQ; ++i) {
-        vl[i]    = get_int_from_uint8(bq5_0->qs, iqs + i);
-        vh[i]    = get_int_from_uint8(bq5_0->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_0);
-    }
-
-    return vec_dot_q5_0_q8_1_impl<VDR_Q5_0_Q8_1_MMVQ>(vl, vh, u, bq5_0->d, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q5_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, float *tile_x_d) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_ql;
-    *x_dm = (sycl::half2 *)tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q5_0(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_0;
-    const int kqsx = k % QI5_0;
-
-    const block_q5_0 * bx0 = (const block_q5_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        const int ql = get_int_from_uint8(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_0));
-
-        int qs0 = (ql >>  0)   & 0x0F0F0F0F;
-        qs0    |= (qh <<  4)   & 0x00000010;  // 0 ->  4
-        qs0    |= (qh << 11)   & 0x00001000;  // 1 -> 12
-        qs0    |= (qh << 18)   & 0x00100000;  // 2 -> 20
-        qs0    |= (qh << 25)   & 0x10000000;  // 3 -> 28
-        qs0 = dpct::vectorized_binary<sycl::char4>(
-            qs0, 0x10101010, dpct::sub_sat()); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4)   & 0x0F0F0F0F;
-        qs1    |= (qh >> 12)   & 0x00000010;  // 16 ->  4
-        qs1    |= (qh >>  5)   & 0x00001000;  // 17 -> 12
-        qs1    |= (qh <<  2)   & 0x00100000;  // 18 -> 20
-        qs1    |= (qh <<  9)   & 0x10000000;  // 19 -> 28
-        qs1 = dpct::vectorized_binary<sycl::char4>(
-            qs1, 0x10101010, dpct::sub_sat()); // subtract 16
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_0) {
-        int i = i0 + i_offset * QI5_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI5_0) + i / QI5_0 + kbxd] = bxi->d;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q5_0_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    int u[2*VDR_Q5_0_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_0_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_0) % WARP_SIZE];
-    }
-
-    return vec_dot_q8_0_q8_1_impl<QR5_0*VDR_Q5_0_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dmf[index_bx], y_df[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q5_1_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
-
-    int vl[VDR_Q5_1_Q8_1_MMVQ];
-    int vh[VDR_Q5_1_Q8_1_MMVQ];
-    int  u[2*VDR_Q5_1_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
-        vl[i]   = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
-        vh[i]   = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
-        u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-        u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
-    }
-
-    return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm, bq8_1->ds);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q5_1(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q5_1(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset < nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_1;
-    const int kqsx = k % QI5_1;
-
-    const block_q5_1 * bx0 = (const block_q5_1 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
-
-        int qs0 = (ql >>  0) & 0x0F0F0F0F;
-        qs0    |= (qh <<  4) & 0x00000010; // 0 ->  4
-        qs0    |= (qh << 11) & 0x00001000; // 1 -> 12
-        qs0    |= (qh << 18) & 0x00100000; // 2 -> 20
-        qs0    |= (qh << 25) & 0x10000000; // 3 -> 28
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+0] = qs0;
-
-        int qs1 = (ql >>  4) & 0x0F0F0F0F;
-        qs1    |= (qh >> 12) & 0x00000010; // 16 ->  4
-        qs1    |= (qh >>  5) & 0x00001000; // 17 -> 12
-        qs1    |= (qh <<  2) & 0x00100000; // 18 -> 20
-        qs1    |= (qh <<  9) & 0x10000000; // 19 -> 28
-
-        x_ql[i * (2*WARP_SIZE + 1) + 2*k+1] = qs1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_1;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_1) {
-        int i = i0 + i_offset * QI5_1 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q5_1_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2));
-    const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1;
-
-    int u[2*VDR_Q5_1_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < VDR_Q5_1_Q8_1_MMQ; ++l) {
-        u[2*l+0] = y_qs[j * WARP_SIZE + (kyqs + l)         % WARP_SIZE];
-        u[2*l+1] = y_qs[j * WARP_SIZE + (kyqs + l + QI5_1) % WARP_SIZE];
-    }
-
-    return vec_dot_q8_1_q8_1_impl<QR5_1*VDR_Q5_1_Q8_1_MMQ>
-        (&x_ql[i * (2*WARP_SIZE + 1) + 2 * k], u, x_dm[index_bx], y_ds[j * (WARP_SIZE/QI8_1) + (2*k/QI8_1) % (WARP_SIZE/QI8_1)]);
-}
-
-static __dpct_inline__ float
-vec_dot_q8_0_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
-
-    int v[VDR_Q8_0_Q8_1_MMVQ];
-    int u[VDR_Q8_0_Q8_1_MMVQ];
-
-#pragma unroll
-    for (int i = 0; i < VDR_Q8_0_Q8_1_MMVQ; ++i) {
-        v[i] = get_int_from_int8(bq8_0->qs, iqs + i);
-        u[i] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
-    }
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMVQ>(v, u, bq8_0->d,
-                                                      bq8_1->ds[1]);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q8_0(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_qs, float *tile_x_d) {
-    (void)x_qh; (void)x_sc;
-
-    *x_ql = tile_x_qs;
-    *x_dm = (sycl::half2 *)tile_x_d;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q8_0(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh; (void)x_sc;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI8_0;
-    const int kqsx = k % QI8_0;
-    float * x_dmf = (float *) x_dm;
-
-    const block_q8_0 * bx0 = (const block_q8_0 *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_int8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI8_0;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI8_0) {
-        int i = i0 + i_offset * QI8_0 + k / blocks_per_tile_x_row;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q8_0 * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI8_0) + i / QI8_0 + kbxd] = bxi->d;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q8_0_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh; (void)x_sc;
-
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    return vec_dot_q8_0_q8_1_impl<VDR_Q8_0_Q8_1_MMQ>
-        (&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[j * WARP_SIZE + k], x_dmf[i * (WARP_SIZE/QI8_0) + i/QI8_0 + k/QI8_0],
-         y_df[j * (WARP_SIZE/QI8_1) + k/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q2_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q2_K * bq2_K = (const block_q2_K *) vbq;
-
-    const int bq8_offset = QR2_K * (iqs / QI8_1);
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const uint8_t * scales = bq2_K->scales + scale_offset;
-
-    const int v = get_int_from_uint8_aligned(bq2_K->qs, iqs);
-    int    u[QR2_K];
-    float d8[QR2_K];
-
-#pragma unroll
-    for (int i = 0; i < QR2_K; ++ i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds[1];
-    }
-
-    return vec_dot_q2_K_q8_1_impl_mmvq(v, u, scales, bq2_K->dm, d8);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q2_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q2_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI2_K;
-    const int kqsx = k % QI2_K;
-
-    const block_q2_K * bx0 = (const block_q2_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI2_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI2_K) {
-        int i = (i0 + i_offset * QI2_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dm[i * (WARP_SIZE/QI2_K) + i / QI2_K + kbxd] = bxi->dm;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q2_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI2_K/4);
-
-        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = get_int_from_uint8_aligned(bxi->scales, k % (QI2_K/4));
-    }
-}
-
-static __dpct_inline__ float vec_dot_q2_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const int kbx = k / QI2_K;
-    const int ky  = (k % QI2_K) * QR2_K;
-    const float * y_df = (const float *) y_ds;
-
-    int v[QR2_K*VDR_Q2_K_Q8_1_MMQ];
-
-    const int kqsx = i * (WARP_SIZE + 1) + kbx*QI2_K + (QI2_K/2) * (ky/(2*QI2_K)) + ky % (QI2_K/2);
-    const int shift = 2 * ((ky % (2*QI2_K)) / (QI2_K/2));
-
-#pragma unroll
-    for (int l = 0; l < QR2_K*VDR_Q2_K_Q8_1_MMQ; ++l) {
-        v[l] = (x_ql[kqsx + l] >> shift) & 0x03030303;
-    }
-
-    const uint8_t * scales = ((const uint8_t *) &x_sc[i * (WARP_SIZE/4) + i/4 + kbx*4]) + ky/4;
-
-    const int index_y = j * WARP_SIZE + (QR2_K*k) % WARP_SIZE;
-    return vec_dot_q2_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dm[i * (WARP_SIZE/QI2_K) + i/QI2_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q3_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q3_K * bq3_K = (const block_q3_K *) vbq;
-
-    const int bq8_offset = QR3_K * (iqs / (QI3_K/2));
-    const int scale_offset = iqs - iqs % QI8_1 + (iqs % QI8_1) / (QI8_1/2);
-
-    const float d = bq3_K->d;
-
-    const int vl = get_int_from_uint8(bq3_K->qs, iqs);
-
-    // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-    const int vh = ~get_int_from_uint8(bq3_K->hmask, iqs % (QI3_K/2)) >> bq8_offset;
-
-    int    u[QR3_K];
-    float d8[QR3_K];
-
-#pragma unroll
-    for (int i = 0; i < QR3_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + i].ds[1];
-    }
-
-    return vec_dot_q3_K_q8_1_impl_mmvq(vl, vh, u, bq3_K->scales, scale_offset, d, d8);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q3_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_qh,
-                    int *tile_x_sc) {
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_qh = tile_x_qh;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q3_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI3_K;
-    const int kqsx = k % QI3_K;
-
-    const block_q3_K * bx0 = (const block_q3_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI3_K;
-    const int kbxd = k % blocks_per_tile_x_row;
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI3_K) {
-        int i = (i0 + i_offset * QI3_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI3_K) + i / QI3_K + kbxd] = bxi->d;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 2) {
-        int i = i0 + i_offset * 2 + k / (WARP_SIZE/2);
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/2)) / (QI3_K/2);
-
-        // invert the mask with ~ so that a 0/1 results in 4/0 being subtracted
-        x_qh[i * (WARP_SIZE/2) + i / 2 + k % (WARP_SIZE/2)] = ~get_int_from_uint8(bxi->hmask, k % (QI3_K/2));
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 4) {
-        int i = i0 + i_offset * 4 + k / (WARP_SIZE/4);
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q3_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/4)) / (QI3_K/4);
-
-        const int ksc = k % (QI3_K/4);
-
-        const int ksc_low = ksc % (QI3_K/8);
-        const int shift_low = 4 * (ksc / (QI3_K/8));
-        const int sc_low = (get_int_from_uint8(bxi->scales, ksc_low) >> shift_low) & 0x0F0F0F0F;
-
-        const int ksc_high = QI3_K/8;
-        const int shift_high = 2 * ksc;
-        const int sc_high = ((get_int_from_uint8(bxi->scales, ksc_high) >> shift_high) << 4) & 0x30303030;
-
-        const int sc = dpct::vectorized_binary<sycl::char4>(
-            sc_low | sc_high, 0x20202020, dpct::sub_sat());
-
-        x_sc[i * (WARP_SIZE/4) + i / 4 + k % (WARP_SIZE/4)] = sc;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q3_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-
-    const int kbx  = k / QI3_K;
-    const int ky  = (k % QI3_K) * QR3_K;
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * scales = ((const int8_t *) (x_sc + i * (WARP_SIZE/4) + i/4 + kbx*4)) + ky/4;
-
-    int v[QR3_K*VDR_Q3_K_Q8_1_MMQ];
-
-#pragma unroll
-    for (int l = 0; l < QR3_K*VDR_Q3_K_Q8_1_MMQ; ++l) {
-        const int kqsx = i * (WARP_SIZE + 1) + kbx*QI3_K + (QI3_K/2) * (ky/(2*QI3_K)) + ky % (QI3_K/2);
-        const int shift = 2 * ((ky % 32) / 8);
-        const int vll = (x_ql[kqsx + l] >> shift) & 0x03030303;
-
-        const int vh = x_qh[i * (WARP_SIZE/2) + i/2 + kbx * (QI3_K/2) + (ky+l)%8] >> ((ky+l) / 8);
-        const int vlh = (vh << 2) & 0x04040404;
-
-        v[l] = dpct::vectorized_binary<sycl::char4>(vll, vlh, dpct::sub_sat());
-    }
-
-    const int index_y = j * WARP_SIZE + (k*QR3_K) % WARP_SIZE;
-    return vec_dot_q3_K_q8_1_impl_mmq(v, &y_qs[index_y], scales, x_dmf[i * (WARP_SIZE/QI3_K) + i/QI3_K + kbx], y_df[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    int    v[2];
-    int    u[2*QR4_K];
-    float d8[QR4_K];
-
-    // iqs is in 0,2..30. bq8_offset = iqs/4 -> bq8_offset = 0, 2, 4, 6
-    const int bq8_offset = QR4_K * ((iqs/2) / (QI8_1/2));
-
-    // iqs = 0....3 -> bq8_offset = 0, want q4_offset = 0, 4, 8, 12
-    // iqs = 4....7 -> bq8_offset = 2, want q4_offset = 32, 36, 40, 44
-    // iqs = 8...11 -> bq8_offset = 4, want q4_offset = 64, 68, 72, 76
-    // iqs = 12..15 -> bq8_offset = 6, want q4_offset = 96, 100, 104, 108
-
-    const int * q4 = (const int *)(bq4_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    v[0] = q4[0];
-    v[1] = q4[4];
-
-    const uint16_t * scales = (const uint16_t *)bq4_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-    for (int i = 0; i < QR4_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds[1];
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q4_K_q8_1_impl_vmmq(v, u, sc, m, bq4_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q4_K * bq4_K = (const block_q4_K *) vbq;
-
-    float sumf_d = 0.0f;
-    float sumf_m = 0.0f;
-
-    uint16_t aux16[2];
-    const uint8_t * s = (const uint8_t *)aux16;
-
-    const uint16_t * a = (const uint16_t *)bq4_K->scales;
-    aux16[0] = a[0] & 0x0f0f;
-    aux16[1] = (a[0] >> 4) & 0x0f0f;
-
-    const float dall = bq4_K->dm[0];
-    const float dmin = bq4_K->dm[1];
-
-    const float d8_1 = __low2float(bq8_1[0].ds);
-    const float d8_2 = __low2float(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * q4 = (const int *)bq4_K->qs + (iqs/2);
-    const int v1 = q4[0];
-    const int v2 = q4[4];
-
-    const int dot1 = __dp4a(ui2, v2 & 0x0f0f0f0f, __dp4a(ui1, v1 & 0x0f0f0f0f, 0));
-    const int dot2 = __dp4a(ui4, (v2 >> 4) & 0x0f0f0f0f, __dp4a(ui3, (v1 >> 4) & 0x0f0f0f0f, 0));
-    const int dot3 = __dp4a(0x01010101, ui2, __dp4a(0x01010101, ui1, 0));
-    const int dot4 = __dp4a(0x01010101, ui4, __dp4a(0x01010101, ui3, 0));
-
-    sumf_d += d8_1 * (dot1 * s[0]) + d8_2 * (dot2 * s[1]);
-    sumf_m += d8_1 * (dot3 * s[2]) + d8_2 * (dot4 * s[3]);
-
-    return dall * sumf_d - dmin * sumf_m;
-
-#else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q4_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q4_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI4_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI4_K; // == k if QK_K == 256
-
-    const block_q4_K * bx0 = (const block_q4_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbx;
-
-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI4_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI4_K) {
-        int i = (i0 + i_offset * QI4_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-#if QK_K == 256
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = bxi->dm;
-#else
-        x_dm[i * (WARP_SIZE/QI4_K) + i / QI4_K + kbxd] = {bxi->dm[0], bxi->dm[1]};
-#endif
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q4_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI4_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE/8);
-
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q4_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8);
-
-    const int index_y = j * WARP_SIZE + (QR4_K*k) % WARP_SIZE;
-    return vec_dot_q4_K_q8_1_impl_mmq(&x_ql[i * (WARP_SIZE + 1) + k], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE/QI4_K) + i/QI4_K], &y_ds[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-#ifndef GGML_QKK_64
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    int   vl[2];
-    int   vh[2];
-    int    u[2*QR5_K];
-    float d8[QR5_K];
-
-    const int bq8_offset = QR5_K * ((iqs/2) / (QI8_1/2));
-    const int * ql = (const int *)(bq5_K->qs + 16 * bq8_offset + 4 * ((iqs/2)%4));
-    const int * qh = (const int *)(bq5_K->qh + 4 * ((iqs/2)%4));
-
-    vl[0] = ql[0];
-    vl[1] = ql[4];
-
-    vh[0] = qh[0] >> bq8_offset;
-    vh[1] = qh[4] >> bq8_offset;
-
-    const uint16_t * scales = (const uint16_t *)bq5_K->scales;
-    uint16_t aux[2];
-    const int j = bq8_offset/2;
-    if (j < 2) {
-        aux[0] = scales[j+0] & 0x3f3f;
-        aux[1] = scales[j+2] & 0x3f3f;
-    } else {
-        aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2);
-        aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2);
-    }
-    const uint8_t * sc = (const uint8_t *)aux;
-    const uint8_t * m  = sc + 2;
-
-#pragma unroll
-    for (int i = 0; i < QR5_K; ++i) {
-        const block_q8_1 * bq8i = bq8_1 + bq8_offset + i;
-        d8[i] = bq8i->ds[0];
-
-        const int * q8 = (const int *)bq8i->qs + ((iqs/2)%4);
-        u[2*i+0] = q8[0];
-        u[2*i+1] = q8[4];
-    }
-
-    return vec_dot_q5_K_q8_1_impl_vmmq(vl, vh, u, sc, m, bq5_K->dm, d8);
-
-#else
-
-#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
-    const block_q5_K * bq5_K = (const block_q5_K *) vbq;
-
-    const int8_t * s = bq5_K->scales;
-
-    const float d = bq5_K->d;
-
-    const float d8_1 = __low2half(bq8_1[0].ds);
-    const float d8_2 = __low2half(bq8_1[1].ds);
-
-    const int ui1 = *((const int *)bq8_1[0].qs + (iqs/2));
-    const int ui2 = *((const int *)bq8_1[0].qs + (iqs/2) + 4);
-    const int ui3 = *((const int *)bq8_1[1].qs + (iqs/2));
-    const int ui4 = *((const int *)bq8_1[1].qs + (iqs/2) + 4);
-
-    const int * ql = (const int *)bq5_K->qs + (iqs/2);
-    const int vl1 = ql[0];
-    const int vl2 = ql[4];
-
-    const int step = 4 * (iqs/2); // 0, 4, 8, 12
-    const int im = step/8; // = 0 for iqs = 0, 2, = 1 for iqs = 4, 6
-    const int in = step%8; // 0, 4, 0, 4
-    const int vh = (*((const int *)(bq5_K->qh + in))) >> im;
-
-    const int v1 = (((vh << 4) & 0x10101010) ^ 0x10101010) | ((vl1 >> 0) & 0x0f0f0f0f);
-    const int v2 = (((vh << 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 0) & 0x0f0f0f0f);
-    const int v3 = (((vh >> 0) & 0x10101010) ^ 0x10101010) | ((vl1 >> 4) & 0x0f0f0f0f);
-    const int v4 = (((vh >> 2) & 0x10101010) ^ 0x10101010) | ((vl2 >> 4) & 0x0f0f0f0f);
-
-    const float sumf_d = d8_1 * (__dp4a(ui1, v1, 0) * s[0] + __dp4a(ui2, v2, 0) * s[1])
-                       + d8_2 * (__dp4a(ui3, v3, 0) * s[2] + __dp4a(ui4, v4, 0) * s[3]);
-
-    return d * sumf_d;
-
-#else
-    assert(false);
-    return 0.0f; // only to satisfy the compiler
-#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
-
-#endif
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q5_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q5_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI5_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI5_K; // == k if QK_K == 256
-
-    const block_q5_K * bx0 = (const block_q5_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR5_K*kqsx;
-
-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8_aligned(bxi->qh, kqsx % (QI5_K/4));
-        const int qh0 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 0)) << 4) & 0x10101010;
-        const int qh1 = ((qh >> (2 * (kqsx / (QI5_K/4)) + 1)) << 4) & 0x10101010;
-
-        const int kq0 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + 0;
-        const int kq1 = ky - ky % (QI5_K/2) + k % (QI5_K/4) + (QI5_K/4);
-
-        x_ql[i * (2*WARP_SIZE + 1) + kq0] = ql0 | qh0;
-        x_ql[i * (2*WARP_SIZE + 1) + kq1] = ql1 | qh1;
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI5_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI5_K) {
-        int i = (i0 + i_offset * QI5_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-#if QK_K == 256
-        x_dm[i * (WARP_SIZE/QI5_K) + i / QI5_K + kbxd] = bxi->dm;
-#endif
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q5_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / (QI5_K/8);
-
-        const int * scales = (const int *) bxi->scales;
-
-        const int ksc = k % (WARP_SIZE/8);
-
-        // scale arrangement after the following two lines: sc0,...,sc3, sc4,...,sc7, m0,...,m3, m4,...,m8
-        int scales8 = (scales[(ksc%2) + (ksc!=0)] >> (4 * (ksc & (ksc/2)))) & 0x0F0F0F0F; // lower 4 bits
-        scales8    |= (scales[ksc/2]              >> (2 * (ksc % 2)))       & 0x30303030; // upper 2 bits
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + ksc] = scales8;
-    }
-}
-
-static __dpct_inline__ float vec_dot_q5_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8);
-
-    const int index_x = i * (QR5_K*WARP_SIZE + 1) +  QR5_K*k;
-    const int index_y = j * WARP_SIZE             + (QR5_K*k) % WARP_SIZE;
-    return vec_dot_q5_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, sc+8,
-                                      x_dm[i * (WARP_SIZE/QI5_K) + i/QI5_K], &y_ds[index_y/QI8_1]);
-}
-
-static __dpct_inline__ float
-vec_dot_q6_K_q8_1(const void *__restrict__ vbq,
-                  const block_q8_1 *__restrict__ bq8_1, const int &iqs) {
-
-    const block_q6_K * bq6_K = (const block_q6_K *) vbq;
-
-    const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/4);
-    const int scale_offset = (QI6_K/4) * (iqs / (QI6_K/2)) + (iqs % (QI6_K/2)) / (QI6_K/8);
-    const int vh_shift = 2 * ((iqs % (QI6_K/2)) / (QI6_K/4));
-
-    const int vl = get_int_from_uint8(bq6_K->ql, iqs);
-    const int vh = get_int_from_uint8(bq6_K->qh, (QI6_K/4) * (iqs / (QI6_K/2)) + iqs % (QI6_K/4)) >> vh_shift;
-
-    const int8_t * scales = bq6_K->scales + scale_offset;
-
-    int    u[QR6_K];
-    float d8[QR6_K];
-
-#pragma unroll
-    for (int i = 0; i < QR6_K; ++i) {
-        u[i]  = get_int_from_int8_aligned(bq8_1[bq8_offset + 2*i].qs, iqs % QI8_1);
-        d8[i] = bq8_1[bq8_offset + 2 * i].ds[1];
-    }
-
-    return vec_dot_q6_K_q8_1_impl_mmvq(vl, vh, u, scales, bq6_K->d, d8);
-}
-
-template <int mmq_y>
-static __dpct_inline__ void
-allocate_tiles_q6_K(int **x_ql, sycl::half2 **x_dm, int **x_qh, int **x_sc,
-                    int *tile_x_ql, sycl::half2 *tile_x_dm, int *tile_x_sc) {
-    (void)x_qh;
-
-    *x_ql = tile_x_ql;
-    *x_dm = tile_x_dm;
-    *x_sc = tile_x_sc;
-}
-
-template <int mmq_y, int nwarps, bool need_check>
-static __dpct_inline__ void
-load_tiles_q6_K(const void *__restrict__ vx, int *__restrict__ x_ql,
-                sycl::half2 *__restrict__ x_dm, int *__restrict__ x_qh,
-                int *__restrict__ x_sc, const int &i_offset, const int &i_max,
-                const int &k, const int &blocks_per_row) {
-    (void)x_qh;
-
-    GGML_CUDA_ASSUME(i_offset >= 0);
-    GGML_CUDA_ASSUME(i_offset <  nwarps);
-    GGML_CUDA_ASSUME(k >= 0);
-    GGML_CUDA_ASSUME(k <  WARP_SIZE);
-
-    const int kbx  = k / QI6_K; // == 0 if QK_K == 256
-    const int kqsx = k % QI6_K; // == k if QK_K == 256
-
-    const block_q6_K * bx0 = (const block_q6_K *) vx;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps) {
-        int i = i0 + i_offset;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbx;
-        const int ky = QR6_K*kqsx;
-
-        const int ql = get_int_from_uint8(bxi->ql, kqsx);
-        const int ql0 = (ql >> 0) & 0x0F0F0F0F;
-        const int ql1 = (ql >> 4) & 0x0F0F0F0F;
-
-        const int qh = get_int_from_uint8(bxi->qh, (QI6_K/4) * (kqsx / (QI6_K/2)) + kqsx % (QI6_K/4));
-        const int qh0 = ((qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4)))) << 4) & 0x30303030;
-        const int qh1 =  (qh >> (2 * ((kqsx % (QI6_K/2)) / (QI6_K/4))))       & 0x30303030;
-
-        const int kq0 = ky - ky % QI6_K + k % (QI6_K/2) + 0;
-        const int kq1 = ky - ky % QI6_K + k % (QI6_K/2) + (QI6_K/2);
-
-        x_ql[i * (2 * WARP_SIZE + 1) + kq0] =
-            dpct::vectorized_binary<sycl::char4>(ql0 | qh0, 0x20202020,
-                                                 dpct::sub_sat());
-        x_ql[i * (2 * WARP_SIZE + 1) + kq1] =
-            dpct::vectorized_binary<sycl::char4>(ql1 | qh1, 0x20202020,
-                                                 dpct::sub_sat());
-    }
-
-    const int blocks_per_tile_x_row = WARP_SIZE / QI6_K; // == 1 if QK_K == 256
-    const int kbxd = k % blocks_per_tile_x_row;          // == 0 if QK_K == 256
-    float * x_dmf = (float *) x_dm;
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * QI6_K) {
-        int i = (i0 + i_offset * QI6_K + k / blocks_per_tile_x_row) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + kbxd;
-
-        x_dmf[i * (WARP_SIZE/QI6_K) + i / QI6_K + kbxd] = bxi->d;
-    }
-
-#pragma unroll
-    for (int i0 = 0; i0 < mmq_y; i0 += nwarps * 8) {
-        int i = (i0 + i_offset * 8 + k / (WARP_SIZE/8)) % mmq_y;
-
-        if (need_check) {
-            i = sycl::min(i, i_max);
-        }
-
-        const block_q6_K * bxi = bx0 + i*blocks_per_row + (k % (WARP_SIZE/8)) / 4;
-
-        x_sc[i * (WARP_SIZE/8) + i / 8 + k % (WARP_SIZE/8)] = get_int_from_int8(bxi->scales, k % (QI6_K/8));
-    }
-}
-
-static __dpct_inline__ float vec_dot_q6_K_q8_1_mul_mat(
-    const int *__restrict__ x_ql, const sycl::half2 *__restrict__ x_dm,
-    const int *__restrict__ x_qh, const int *__restrict__ x_sc,
-    const int *__restrict__ y_qs, const sycl::half2 *__restrict__ y_ds,
-    const int &i, const int &j, const int &k) {
-    (void)x_qh;
-
-    const float * x_dmf = (const float *) x_dm;
-    const float * y_df  = (const float *) y_ds;
-
-    const int8_t * sc = ((const int8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/8]);
-
-    const int index_x = i * (QR6_K*WARP_SIZE + 1) +  QR6_K*k;
-    const int index_y = j * WARP_SIZE             + (QR6_K*k) % WARP_SIZE;
-    return vec_dot_q6_K_q8_1_impl_mmq(&x_ql[index_x], &y_qs[index_y], sc, x_dmf[i * (WARP_SIZE/QI6_K) + i/QI6_K], &y_df[index_y/QI8_1]);
-}
-
-template <int qk, int qr, int qi, bool need_sum, typename block_q_t, int mmq_x,
-          int mmq_y, int nwarps, allocate_tiles_cuda_t allocate_tiles,
-          load_tiles_cuda_t load_tiles, int vdr,
-          vec_dot_q_mul_mat_cuda_t vec_dot>
-/*
-DPCT1110:19: The total declared local variable size in device function mul_mat_q
-exceeds 128 bytes and may cause high register pressure. Consult with your
-hardware vendor to find the total register size available and adjust the code,
-or use smaller sub-group size to avoid high register pressure.
-*/
-static __dpct_inline__ void
-mul_mat_q(const void *__restrict__ vx, const void *__restrict__ vy,
-          float *__restrict__ dst, const int ncols_x, const int nrows_x,
-          const int ncols_y, const int nrows_y, const int nrows_dst,
-          const sycl::nd_item<3> &item_ct1, int *tile_y_qs,
-          sycl::half2 *tile_y_ds) {
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    const int blocks_per_row_x = ncols_x / qk;
-    const int blocks_per_col_y = nrows_y / QK8_1;
-    const int blocks_per_warp = WARP_SIZE / qi;
-
-    const int & ncols_dst = ncols_y;
-
-    const int row_dst_0 = item_ct1.get_group(2) * mmq_y;
-    const int & row_x_0 = row_dst_0;
-
-    const int col_dst_0 = item_ct1.get_group(1) * mmq_x;
-    const int & col_y_0 = col_dst_0;
-
-    int   * tile_x_ql = nullptr;
-    sycl::half2 *tile_x_dm = nullptr;
-    int   * tile_x_qh = nullptr;
-    int   * tile_x_sc = nullptr;
-
-    allocate_tiles(&tile_x_ql, &tile_x_dm, &tile_x_qh, &tile_x_sc);
-
-    float sum[mmq_y/WARP_SIZE][mmq_x/nwarps] = {{0.0f}};
-
-    for (int ib0 = 0; ib0 < blocks_per_row_x; ib0 += blocks_per_warp) {
-
-        load_tiles(x + row_x_0 * blocks_per_row_x + ib0, tile_x_ql, tile_x_dm,
-                   tile_x_qh, tile_x_sc, item_ct1.get_local_id(1),
-                   nrows_x - row_x_0 - 1, item_ct1.get_local_id(2),
-                   blocks_per_row_x);
-
-#pragma unroll
-        for (int ir = 0; ir < qr; ++ir) {
-            const int kqs = ir * WARP_SIZE + item_ct1.get_local_id(2);
-            const int kbxd = kqs / QI8_1;
-
-#pragma unroll
-            for (int i = 0; i < mmq_x; i += nwarps) {
-                const int col_y_eff = dpct::min(
-                    (unsigned int)(col_y_0 + item_ct1.get_local_id(1) + i),
-                    ncols_y - 1); // to prevent out-of-bounds memory accesses
-
-                const block_q8_1 * by0 = &y[col_y_eff*blocks_per_col_y + ib0 * (qk/QK8_1) + kbxd];
-
-                const int index_y = (item_ct1.get_local_id(1) + i) * WARP_SIZE +
-                                    kqs % WARP_SIZE;
-                tile_y_qs[index_y] = get_int_from_int8_aligned(
-                    by0->qs, item_ct1.get_local_id(2) % QI8_1);
-            }
-
-#pragma unroll
-            for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
-                const int ids =
-                    (ids0 + item_ct1.get_local_id(1) * QI8_1 +
-                     item_ct1.get_local_id(2) / (WARP_SIZE / QI8_1)) %
-                    mmq_x;
-                const int kby = item_ct1.get_local_id(2) % (WARP_SIZE / QI8_1);
-                const int col_y_eff = sycl::min(col_y_0 + ids, ncols_y - 1);
-
-                // if the sum is not needed it's faster to transform the scale to f32 ahead of time
-                const sycl::half2 *dsi_src =
-                    &y[col_y_eff * blocks_per_col_y + ib0 * (qk / QK8_1) +
-                       ir * (WARP_SIZE / QI8_1) + kby]
-                         .ds;
-                sycl::half2 *dsi_dst =
-                    &tile_y_ds[ids * (WARP_SIZE / QI8_1) + kby];
-                if (need_sum) {
-                    *dsi_dst = *dsi_src;
-                } else {
-                    float * dfi_dst = (float *) dsi_dst;
-                    *dfi_dst = (*dsi_src)[1];
-                }
-            }
-
-            /*
-            DPCT1118:20: SYCL group functions and algorithms must be encountered
-            in converged control flow. You may need to adjust the code.
-            */
-            /*
-            DPCT1065:85: Consider replacing sycl::nd_item::barrier() with
-            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-            better performance if there is no access to global memory.
-            */
-            item_ct1.barrier();
-
-// #pragma unroll // unrolling this loop causes too much register pressure
-            for (int k = ir*WARP_SIZE/qr; k < (ir+1)*WARP_SIZE/qr; k += vdr) {
-#pragma unroll
-                for (int j = 0; j < mmq_x; j += nwarps) {
-#pragma unroll
-                    for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-                        sum[i / WARP_SIZE][j / nwarps] += vec_dot(
-                            tile_x_ql, tile_x_dm, tile_x_qh, tile_x_sc,
-                            tile_y_qs, tile_y_ds, item_ct1.get_local_id(2) + i,
-                            item_ct1.get_local_id(1) + j, k);
-                    }
-                }
-            }
-
-            /*
-            DPCT1118:21: SYCL group functions and algorithms must be encountered
-            in converged control flow. You may need to adjust the code.
-            */
-            /*
-            DPCT1065:86: Consider replacing sycl::nd_item::barrier() with
-            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-            better performance if there is no access to global memory.
-            */
-            item_ct1.barrier();
-        }
-    }
-
-#pragma unroll
-    for (int j = 0; j < mmq_x; j += nwarps) {
-        const int col_dst = col_dst_0 + j + item_ct1.get_local_id(1);
-
-        if (col_dst >= ncols_dst) {
-            return;
-        }
-
-#pragma unroll
-        for (int i = 0; i < mmq_y; i += WARP_SIZE) {
-            const int row_dst = row_dst_0 + item_ct1.get_local_id(2) + i;
-
-            if (row_dst >= nrows_dst) {
-                continue;
-            }
-
-            dst[col_dst*nrows_dst + row_dst] = sum[i/WARP_SIZE][j/nwarps];
-        }
-    }
-}
-
-#define  MMQ_X_Q4_0_RDNA2  64
-#define  MMQ_Y_Q4_0_RDNA2  128
-#define NWARPS_Q4_0_RDNA2  8
-#define  MMQ_X_Q4_0_RDNA1  64
-#define  MMQ_Y_Q4_0_RDNA1  64
-#define NWARPS_Q4_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_0_AMPERE 4
-#define  MMQ_Y_Q4_0_AMPERE 32
-#define NWARPS_Q4_0_AMPERE 4
-#else
-#define  MMQ_X_Q4_0_AMPERE 64
-#define  MMQ_Y_Q4_0_AMPERE 128
-#define NWARPS_Q4_0_AMPERE 4
-#endif
-#define  MMQ_X_Q4_0_PASCAL 64
-#define  MMQ_Y_Q4_0_PASCAL 64
-#define NWARPS_Q4_0_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q4_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_0_RDNA2;
-    const int nwarps = NWARPS_Q4_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_0_RDNA1;
-    const int nwarps = NWARPS_Q4_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_0_AMPERE;
-    const int nwarps = NWARPS_Q4_0_AMPERE;
-
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_0_PASCAL;
-    const int nwarps = NWARPS_Q4_0_PASCAL;
-
-    mul_mat_q<QK4_0, QR4_0, QI4_0, true, block_q4_0, mmq_x, mmq_y, nwarps, allocate_tiles_q4_0<mmq_y>,
-        load_tiles_q4_0<mmq_y, nwarps, need_check>, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q4_0_q8_1_mul_mat;
-    /*
-    DPCT1007:87: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q4_1_RDNA2  64
-#define  MMQ_Y_Q4_1_RDNA2  128
-#define NWARPS_Q4_1_RDNA2  8
-#define  MMQ_X_Q4_1_RDNA1  64
-#define  MMQ_Y_Q4_1_RDNA1  64
-#define NWARPS_Q4_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_1_AMPERE 4
-#define  MMQ_Y_Q4_1_AMPERE 32
-#define NWARPS_Q4_1_AMPERE 4
-#else
-#define  MMQ_X_Q4_1_AMPERE 64
-#define  MMQ_Y_Q4_1_AMPERE 128
-#define NWARPS_Q4_1_AMPERE 4
-#endif
-#define  MMQ_X_Q4_1_PASCAL 64
-#define  MMQ_Y_Q4_1_PASCAL 64
-#define NWARPS_Q4_1_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_1_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q4_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_1_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_1_RDNA2;
-    const int nwarps = NWARPS_Q4_1_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_1_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_1_RDNA1;
-    const int nwarps = NWARPS_Q4_1_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_1_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_1_AMPERE;
-    const int nwarps = NWARPS_Q4_1_AMPERE;
-
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_1_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_1_PASCAL;
-    const int nwarps = NWARPS_Q4_1_PASCAL;
-
-    mul_mat_q<QK4_1, QR4_1, QI4_1, true, block_q4_1, mmq_x, mmq_y, nwarps, allocate_tiles_q4_1<mmq_y>,
-        load_tiles_q4_1<mmq_y, nwarps, need_check>, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q4_1_q8_1_mul_mat;
-    /*
-    DPCT1007:88: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_0_RDNA2  64
-#define  MMQ_Y_Q5_0_RDNA2  128
-#define NWARPS_Q5_0_RDNA2  8
-#define  MMQ_X_Q5_0_RDNA1  64
-#define  MMQ_Y_Q5_0_RDNA1  64
-#define NWARPS_Q5_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_0_AMPERE 4
-#define  MMQ_Y_Q5_0_AMPERE 32
-#define NWARPS_Q5_0_AMPERE 4
-#else
-#define  MMQ_X_Q5_0_AMPERE 128
-#define  MMQ_Y_Q5_0_AMPERE 64
-#define NWARPS_Q5_0_AMPERE 4
-#endif
-#define  MMQ_X_Q5_0_PASCAL 64
-#define  MMQ_Y_Q5_0_PASCAL 64
-#define NWARPS_Q5_0_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q5_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_0_RDNA2;
-    const int nwarps = NWARPS_Q5_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_0_RDNA1;
-    const int nwarps = NWARPS_Q5_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_0_AMPERE;
-    const int nwarps = NWARPS_Q5_0_AMPERE;
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_0_PASCAL;
-    const int nwarps = NWARPS_Q5_0_PASCAL;
-
-    mul_mat_q<QK5_0, QR5_0, QI5_0, false, block_q5_0, mmq_x, mmq_y, nwarps, allocate_tiles_q5_0<mmq_y>,
-        load_tiles_q5_0<mmq_y, nwarps, need_check>, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q5_0_q8_1_mul_mat;
-    /*
-    DPCT1007:89: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_1_RDNA2  64
-#define  MMQ_Y_Q5_1_RDNA2  128
-#define NWARPS_Q5_1_RDNA2  8
-#define  MMQ_X_Q5_1_RDNA1  64
-#define  MMQ_Y_Q5_1_RDNA1  64
-#define NWARPS_Q5_1_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_1_AMPERE 4
-#define  MMQ_Y_Q5_1_AMPERE 32
-#define NWARPS_Q5_1_AMPERE 4
-#else
-#define  MMQ_X_Q5_1_AMPERE 128
-#define  MMQ_Y_Q5_1_AMPERE 64
-#define NWARPS_Q5_1_AMPERE 4
-#endif
-#define  MMQ_X_Q5_1_PASCAL 64
-#define  MMQ_Y_Q5_1_PASCAL 64
-#define NWARPS_Q5_1_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_1_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q5_1(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_1_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_1_RDNA2;
-    const int nwarps = NWARPS_Q5_1_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_1_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_1_RDNA1;
-    const int nwarps = NWARPS_Q5_1_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_1_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_1_AMPERE;
-    const int nwarps = NWARPS_Q5_1_AMPERE;
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_1_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_1_PASCAL;
-    const int nwarps = NWARPS_Q5_1_PASCAL;
-
-    mul_mat_q<QK5_1, QR5_1, QI5_1, true, block_q5_1, mmq_x, mmq_y, nwarps, allocate_tiles_q5_1<mmq_y>,
-        load_tiles_q5_1<mmq_y, nwarps, need_check>, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q5_1_q8_1_mul_mat;
-    /*
-    DPCT1007:90: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q8_0_RDNA2  64
-#define  MMQ_Y_Q8_0_RDNA2  128
-#define NWARPS_Q8_0_RDNA2  8
-#define  MMQ_X_Q8_0_RDNA1  64
-#define  MMQ_Y_Q8_0_RDNA1  64
-#define NWARPS_Q8_0_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q8_0_AMPERE 4
-#define  MMQ_Y_Q8_0_AMPERE 32
-#define NWARPS_Q8_0_AMPERE 4
-#else
-#define  MMQ_X_Q8_0_AMPERE 128
-#define  MMQ_Y_Q8_0_AMPERE 64
-#define NWARPS_Q8_0_AMPERE 4
-#endif
-#define  MMQ_X_Q8_0_PASCAL 64
-#define  MMQ_Y_Q8_0_PASCAL 64
-#define NWARPS_Q8_0_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q8_0_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    mul_mat_q8_0(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q8_0_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q8_0_RDNA2;
-    const int nwarps = NWARPS_Q8_0_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q8_0_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q8_0_RDNA1;
-    const int nwarps = NWARPS_Q8_0_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q8_0_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q8_0_AMPERE;
-    const int nwarps = NWARPS_Q8_0_AMPERE;
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q8_0_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q8_0_PASCAL;
-    const int nwarps = NWARPS_Q8_0_PASCAL;
-
-    mul_mat_q<QK8_0, QR8_0, QI8_0, false, block_q8_0, mmq_x, mmq_y, nwarps, allocate_tiles_q8_0<mmq_y>,
-        load_tiles_q8_0<mmq_y, nwarps, need_check>, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q8_0_q8_1_mul_mat;
-    /*
-    DPCT1007:91: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q2_K_RDNA2  64
-#define  MMQ_Y_Q2_K_RDNA2  128
-#define NWARPS_Q2_K_RDNA2  8
-#define  MMQ_X_Q2_K_RDNA1  128
-#define  MMQ_Y_Q2_K_RDNA1  32
-#define NWARPS_Q2_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q2_K_AMPERE 4
-#define  MMQ_Y_Q2_K_AMPERE 32
-#define NWARPS_Q2_K_AMPERE 4
-#else
-#define  MMQ_X_Q2_K_AMPERE 64
-#define  MMQ_Y_Q2_K_AMPERE 128
-#define NWARPS_Q2_K_AMPERE 4
-#endif
-#define  MMQ_X_Q2_K_PASCAL 64
-#define  MMQ_Y_Q2_K_PASCAL 64
-#define NWARPS_Q2_K_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q2_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q2_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q2_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q2_K_RDNA2;
-    const int nwarps = NWARPS_Q2_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q2_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q2_K_RDNA1;
-    const int nwarps = NWARPS_Q2_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q2_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q2_K_AMPERE;
-    const int nwarps = NWARPS_Q2_K_AMPERE;
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q2_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q2_K_PASCAL;
-    const int nwarps = NWARPS_Q2_K_PASCAL;
-
-    mul_mat_q<QK_K, QR2_K, QI2_K, false, block_q2_K, mmq_x, mmq_y, nwarps, allocate_tiles_q2_K<mmq_y>,
-        load_tiles_q2_K<mmq_y, nwarps, need_check>, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q2_K_q8_1_mul_mat;
-    /*
-    DPCT1007:92: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q3_K_RDNA2  128
-#define  MMQ_Y_Q3_K_RDNA2  64
-#define NWARPS_Q3_K_RDNA2  8
-#define  MMQ_X_Q3_K_RDNA1  32
-#define  MMQ_Y_Q3_K_RDNA1  128
-#define NWARPS_Q3_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q3_K_AMPERE 4
-#define  MMQ_Y_Q3_K_AMPERE 32
-#define NWARPS_Q3_K_AMPERE 4
-#else
-#define  MMQ_X_Q3_K_AMPERE 128
-#define  MMQ_Y_Q3_K_AMPERE 128
-#define NWARPS_Q3_K_AMPERE 4
-#endif
-#define  MMQ_X_Q3_K_PASCAL 64
-#define  MMQ_Y_Q3_K_PASCAL 64
-#define NWARPS_Q3_K_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q3_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q3_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q3_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q3_K_RDNA2;
-    const int nwarps = NWARPS_Q3_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q3_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q3_K_RDNA1;
-    const int nwarps = NWARPS_Q3_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q3_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q3_K_AMPERE;
-    const int nwarps = NWARPS_Q3_K_AMPERE;
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q3_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q3_K_PASCAL;
-    const int nwarps = NWARPS_Q3_K_PASCAL;
-
-    mul_mat_q<QK_K, QR3_K, QI3_K, false, block_q3_K, mmq_x, mmq_y, nwarps, allocate_tiles_q3_K<mmq_y>,
-        load_tiles_q3_K<mmq_y, nwarps, need_check>, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q3_K_q8_1_mul_mat;
-    /*
-    DPCT1007:93: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q4_K_RDNA2  64
-#define  MMQ_Y_Q4_K_RDNA2  128
-#define NWARPS_Q4_K_RDNA2  8
-#define  MMQ_X_Q4_K_RDNA1  32
-#define  MMQ_Y_Q4_K_RDNA1  64
-#define NWARPS_Q4_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q4_K_AMPERE 4
-#define  MMQ_Y_Q4_K_AMPERE 32
-#define NWARPS_Q4_K_AMPERE 4
-#else
-#define  MMQ_X_Q4_K_AMPERE 64
-#define  MMQ_Y_Q4_K_AMPERE 128
-#define NWARPS_Q4_K_AMPERE 4
-#endif
-#define  MMQ_X_Q4_K_PASCAL 64
-#define  MMQ_Y_Q4_K_PASCAL 64
-#define NWARPS_Q4_K_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q4_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q4_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q4_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q4_K_RDNA2;
-    const int nwarps = NWARPS_Q4_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q4_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q4_K_RDNA1;
-    const int nwarps = NWARPS_Q4_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q4_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q4_K_AMPERE;
-    const int nwarps = NWARPS_Q4_K_AMPERE;
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q4_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q4_K_PASCAL;
-    const int nwarps = NWARPS_Q4_K_PASCAL;
-
-    mul_mat_q<QK_K, QR4_K, QI4_K, true, block_q4_K, mmq_x, mmq_y, nwarps, allocate_tiles_q4_K<mmq_y>,
-        load_tiles_q4_K<mmq_y, nwarps, need_check>, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q4_K_q8_1_mul_mat;
-    /*
-    DPCT1007:94: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q5_K_RDNA2  64
-#define  MMQ_Y_Q5_K_RDNA2  128
-#define NWARPS_Q5_K_RDNA2  8
-#define  MMQ_X_Q5_K_RDNA1  32
-#define  MMQ_Y_Q5_K_RDNA1  64
-#define NWARPS_Q5_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q5_K_AMPERE 4
-#define  MMQ_Y_Q5_K_AMPERE 32
-#define NWARPS_Q5_K_AMPERE 4
-#else
-#define  MMQ_X_Q5_K_AMPERE 64
-#define  MMQ_Y_Q5_K_AMPERE 128
-#define NWARPS_Q5_K_AMPERE 4
-#endif
-#define  MMQ_X_Q5_K_PASCAL 64
-#define  MMQ_Y_Q5_K_PASCAL 64
-#define NWARPS_Q5_K_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q5_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-mul_mat_q5_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q5_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q5_K_RDNA2;
-    const int nwarps = NWARPS_Q5_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q5_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q5_K_RDNA1;
-    const int nwarps = NWARPS_Q5_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q5_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q5_K_AMPERE;
-    const int nwarps = NWARPS_Q5_K_AMPERE;
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q5_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q5_K_PASCAL;
-    const int nwarps = NWARPS_Q5_K_PASCAL;
-
-    mul_mat_q<QK_K, QR5_K, QI5_K, true, block_q5_K, mmq_x, mmq_y, nwarps, allocate_tiles_q5_K<mmq_y>,
-        load_tiles_q5_K<mmq_y, nwarps, need_check>, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q5_K_q8_1_mul_mat;
-    /*
-    DPCT1007:95: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-#define  MMQ_X_Q6_K_RDNA2  64
-#define  MMQ_Y_Q6_K_RDNA2  128
-#define NWARPS_Q6_K_RDNA2  8
-#define  MMQ_X_Q6_K_RDNA1  32
-#define  MMQ_Y_Q6_K_RDNA1  64
-#define NWARPS_Q6_K_RDNA1  8
-#if defined(CUDA_USE_TENSOR_CORES)
-#define  MMQ_X_Q6_K_AMPERE 4
-#define  MMQ_Y_Q6_K_AMPERE 32
-#define NWARPS_Q6_K_AMPERE 4
-#else
-#define  MMQ_X_Q6_K_AMPERE 64
-#define  MMQ_Y_Q6_K_AMPERE 64
-#define NWARPS_Q6_K_AMPERE 4
-#endif
-#define  MMQ_X_Q6_K_PASCAL 64
-#define  MMQ_Y_Q6_K_PASCAL 64
-#define NWARPS_Q6_K_PASCAL 8
-
-template <bool need_check> static void
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    __launch_bounds__(WARP_SIZE*NWARPS_Q6_K_RDNA2, 2)
-#endif // defined(RDNA3) || defined(RDNA2)
-#elif DPCT_COMPATIBILITY_TEMP < CC_VOLTA
-
-#endif // __CUDA_ARCH__ < CC_VOLTA
-    mul_mat_q6_K(
-    const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-#if defined(RDNA3) || defined(RDNA2)
-    const int mmq_x  =  MMQ_X_Q6_K_RDNA2;
-    const int mmq_y  =  MMQ_Y_Q6_K_RDNA2;
-    const int nwarps = NWARPS_Q6_K_RDNA2;
-#else
-    const int mmq_x  =  MMQ_X_Q6_K_RDNA1;
-    const int mmq_y  =  MMQ_Y_Q6_K_RDNA1;
-    const int nwarps = NWARPS_Q6_K_RDNA1;
-#endif // defined(RDNA3) || defined(RDNA2)
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= CC_VOLTA
-    const int mmq_x  =  MMQ_X_Q6_K_AMPERE;
-    const int mmq_y  =  MMQ_Y_Q6_K_AMPERE;
-    const int nwarps = NWARPS_Q6_K_AMPERE;
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-
-#elif DPCT_COMPATIBILITY_TEMP >= MIN_CC_DP4A
-    const int mmq_x  =  MMQ_X_Q6_K_PASCAL;
-    const int mmq_y  =  MMQ_Y_Q6_K_PASCAL;
-    const int nwarps = NWARPS_Q6_K_PASCAL;
-
-    mul_mat_q<QK_K, QR6_K, QI6_K, false, block_q6_K, mmq_x, mmq_y, nwarps, allocate_tiles_q6_K<mmq_y>,
-        load_tiles_q6_K<mmq_y, nwarps, need_check>, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat>
-        (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst);
-#else
-    (void) vec_dot_q6_K_q8_1_mul_mat;
-    /*
-    DPCT1007:96: Migration of __assert_fail is not supported.
-    */
-    assert(false);
-#endif // __CUDA_ARCH__ >= CC_VOLTA
-}
-
-template <int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
-static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, const int ncols, const int nrows,
-                          const sycl::nd_item<3> &item_ct1) {
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int blocks_per_row = ncols / qk;
-    const int blocks_per_warp = vdr * WARP_SIZE / qi;
-
-// partial sum for each thread
-    float tmp = 0.0f;
-
-    const block_q_t  * x = (const block_q_t  *) vx;
-    const block_q8_1 * y = (const block_q8_1 *) vy;
-
-    for (int i = 0; i < blocks_per_row; i += blocks_per_warp) {
-        const int ibx = row * blocks_per_row + i +
-                        item_ct1.get_local_id(2) / (qi / vdr); // x block index
-
-        const int iby = (i + item_ct1.get_local_id(2) / (qi / vdr)) *
-                        (qk / QK8_1); // y block index that aligns with ibx
-
-        const int iqs =
-            vdr *
-            (item_ct1.get_local_id(2) %
-             (qi / vdr)); // x block quant index when casting the quants to int
-
-        tmp += vec_dot_q_cuda(&x[ibx], &y[iby], iqs);
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:22: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[row] = tmp;
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel>
-static void dequantize_mul_mat_vec(const void * __restrict__ vx, const dfloat * __restrict__ y, float * __restrict__ dst, const int ncols, const int nrows,
-                                   const sycl::nd_item<3> &item_ct1) {
-    // qk = quantized weights per x block
-    // qr = number of quantized weights per data value in x block
-    const int row = item_ct1.get_group(2) * item_ct1.get_local_range(1) +
-                    item_ct1.get_local_id(1);
-
-    if (row >= nrows) {
-        return;
-    }
-
-    const int tid = item_ct1.get_local_id(2);
-
-    const int iter_stride = 2*GGML_CUDA_DMMV_X;
-    const int vals_per_iter = iter_stride / WARP_SIZE; // num quantized vals per thread and i iter
-    const int y_offset = qr == 1 ? 1 : qk/2;
-
-// partial sum for each thread
-#ifdef GGML_CUDA_F16
-    half2 tmp = {0.0f, 0.0f}; // two sums for f16 to take advantage of half2 intrinsics
-#else
-    float tmp = 0.0f;
-#endif // GGML_CUDA_F16
-
-    for (int i = 0; i < ncols; i += iter_stride) {
-        const int col = i + vals_per_iter*tid;
-        const int ib = (row*ncols + col)/qk; // x block index
-        const int iqs = (col%qk)/qr; // x quant index
-        const int iybs = col - col%qk; // y block start index
-
-// processing >2 values per i iter is faster for fast GPUs
-#pragma unroll
-        for (int j = 0; j < vals_per_iter; j += 2) {
-            // process 2 vals per j iter
-
-            // dequantize
-            // for qr = 2 the iqs needs to increase by 1 per j iter because 2 weights per data val
-            dfloat2 v;
-            dequantize_kernel(vx, ib, iqs + j/qr, v);
-
-            // matrix multiplication
-            // for qr = 2 the y index needs to increase by 1 per j iter because of y_offset = qk/2
-#ifdef GGML_CUDA_F16
-            tmp += __hmul2(v, {
-                y[iybs + iqs + j/qr + 0],
-                y[iybs + iqs + j/qr + y_offset]
-            });
-#else
-            tmp += v.x() * y[iybs + iqs + j / qr + 0];
-            tmp += v.y() * y[iybs + iqs + j / qr + y_offset];
-#endif // GGML_CUDA_F16
-        }
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:23: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (tid == 0) {
-#ifdef GGML_CUDA_F16
-        dst[row] = tmp.x + tmp.y;
-#else
-        dst[row] = tmp;
-#endif // GGML_CUDA_F16
-    }
-}
-
-static void mul_mat_p021_f16_f32(
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
-    const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y,
-    const sycl::nd_item<3> &item_ct1) {
-
-    const sycl::half *x = (const sycl::half *)vx;
-
-    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                      item_ct1.get_local_id(1);
-    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                        item_ct1.get_local_id(0);
-    const int channel_x = channel / (nchannels_y / nchannels_x);
-
-    const int nrows_y = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst = row_x;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x;
-         col_x0 += item_ct1.get_local_range(2)) {
-        const int col_x = col_x0 + item_ct1.get_local_id(2);
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        // x is transposed and permuted
-        const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
-        const float xi =
-            sycl::vec<sycl::half, 1>{x[ix]}
-                .convert<float, sycl::rounding_mode::automatic>()[0];
-
-        const int row_y = col_x;
-
-
-        // y is not transposed but permuted
-        const int iy = channel*nrows_y + row_y;
-
-        tmp += xi * y[iy];
-    }
-
-    // dst is not transposed and not permuted
-    const int idst = channel*nrows_dst + row_dst;
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:24: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
-    const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
-    const int row_stride_x, const int channel_stride_x, const int channel_x_divisor,
-    const sycl::nd_item<3> &item_ct1) {
-
-    const sycl::half *x = (const sycl::half *)vx;
-
-    const int row_x = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                      item_ct1.get_local_id(1);
-    const int channel = item_ct1.get_local_range(0) * item_ct1.get_group(0) +
-                        item_ct1.get_local_id(0);
-    const int channel_x = channel / channel_x_divisor;
-
-    const int nrows_y   = ncols_x;
-    const int nrows_dst = nrows_x;
-    const int row_dst   = row_x;
-
-    const int idst = channel*nrows_dst + row_dst;
-
-    float tmp = 0.0f;
-
-    for (int col_x0 = 0; col_x0 < ncols_x;
-         col_x0 += item_ct1.get_local_range(2)) {
-        const int col_x = col_x0 + item_ct1.get_local_id(2);
-
-        if (col_x >= ncols_x) {
-            break;
-        }
-
-        const int row_y = col_x;
-
-        const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
-        const int iy = channel*nrows_y + row_y;
-
-        const float xi =
-            sycl::vec<sycl::half, 1>{x[ix]}
-                .convert<float, sycl::rounding_mode::automatic>()[0];
-
-        tmp += xi * y[iy];
-    }
-
-    // sum up partial sums and write back result
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        /*
-        DPCT1023:25: The SYCL sub-group does not support mask options for
-        dpct::permute_sub_group_by_xor. You can specify
-        "--use-experimental-features=masked-sub-group-operation" to use the
-        experimental helper function to migrate __shfl_xor_sync.
-        */
-        tmp +=
-            dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
-    }
-
-    if (item_ct1.get_local_id(2) == 0) {
-        dst[idst] = tmp;
-    }
-}
-
-static void cpy_1_f32_f32(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    float * dsti = (float *) cdsti;
-
-    *dsti = *xi;
-}
-
-static void cpy_1_f32_f16(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    sycl::half *dsti = (sycl::half *)cdsti;
-
-    *dsti = sycl::vec<float, 1>{(*xi)}
-                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
-}
-
-static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
-    const sycl::half *xi = (const sycl::half *)cxi;
-    sycl::half *dsti = (sycl::half *)cdsti;
-
-    *dsti = *xi;
-}
-
-template <cpy_kernel_t cpy_1>
-static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
-                                   const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-                                   const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
-                                   const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= ne) {
-        return;
-    }
-
-    // determine indices i02/i12, i01/i11, i00/i10 as a function of index i of flattened tensor
-    // then combine those indices with the corresponding byte offsets to get the total offsets
-    const int i02 = i / (ne00*ne01);
-    const int i01 = (i - i02*ne01*ne00) / ne00;
-    const int i00 = i - i02*ne01*ne00 - i01*ne00;
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
-
-    const int i12 = i / (ne10*ne11);
-    const int i11 = (i - i12*ne10*ne11) / ne10;
-    const int i10 = i - i12*ne10*ne11 - i11*ne10;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
-
-    cpy_1(cx + x_offset, cdst + dst_offset);
-}
-
-static void cpy_blck_f32_q8_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q8_0 * dsti = (block_q8_0 *) cdsti;
-
-    float amax = 0.0f; // absolute max
-
-    for (int j = 0; j < QK8_0; j++) {
-        const float v = xi[j];
-        amax = sycl::fmax(amax, sycl::fabs((float)v));
-    }
-
-    const float d = amax / ((1 << 7) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK8_0; ++j) {
-        const float x0 = xi[j]*id;
-
-        dsti->qs[j] = sycl::round((float)x0);
-    }
-}
-
-static void cpy_blck_f32_q4_0(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_0 * dsti = (block_q4_0 *) cdsti;
-
-    float amax = 0.0f;
-    float vmax = 0.0f;
-
-    for (int j = 0; j < QK4_0; ++j) {
-        const float v = xi[j];
-        if (amax < sycl::fabs((float)v)) {
-            amax = sycl::fabs((float)v);
-            vmax = v;
-        }
-    }
-
-    const float d  = vmax / -8;
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->d = d;
-
-    for (int j = 0; j < QK4_0/2; ++j) {
-        const float x0 = xi[0       + j]*id;
-        const float x1 = xi[QK4_0/2 + j]*id;
-
-        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 8.5f));
-        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 8.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-static void cpy_blck_f32_q4_1(const char * cxi, char * cdsti) {
-    const float * xi = (const float *) cxi;
-    block_q4_1 * dsti = (block_q4_1 *) cdsti;
-
-    float vmin = FLT_MAX;
-    float vmax = -FLT_MAX;
-
-    for (int j = 0; j < QK4_1; ++j) {
-        const float v = xi[j];
-
-        if (v < vmin) vmin = v;
-        if (v > vmax) vmax = v;
-    }
-
-    const float d  = (vmax - vmin) / ((1 << 4) - 1);
-    const float id = d ? 1.0f/d : 0.0f;
-
-    dsti->dm.x() = d;
-    dsti->dm.y() = vmin;
-
-    for (int j = 0; j < QK4_1/2; ++j) {
-        const float x0 = (xi[0       + j] - vmin)*id;
-        const float x1 = (xi[QK4_1/2 + j] - vmin)*id;
-
-        const uint8_t xi0 = dpct::min(15, (int8_t)(x0 + 0.5f));
-        const uint8_t xi1 = dpct::min(15, (int8_t)(x1 + 0.5f));
-
-        dsti->qs[j]  = xi0;
-        dsti->qs[j] |= xi1 << 4;
-    }
-}
-
-template <cpy_kernel_t cpy_blck, int qk>
-static void cpy_f32_q(const char * cx, char * cdst, const int ne,
-                                 const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
-                                 const int ne10, const int ne11, const int nb10, const int nb11, const int nb12,
-                                 const sycl::nd_item<3> &item_ct1) {
-    const int i = (item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                   item_ct1.get_local_id(2)) *
-                  qk;
-
-    if (i >= ne) {
-        return;
-    }
-
-    const int i02 = i / (ne00*ne01);
-    const int i01 = (i - i02*ne01*ne00) / ne00;
-    const int i00 = (i - i02*ne01*ne00 - i01*ne00);
-    const int x_offset = i00*nb00 + i01*nb01 + i02*nb02;
-
-    const int i12 = i / (ne10*ne11);
-    const int i11 = (i - i12*ne10*ne11) / ne10;
-    const int i10 = (i - i12*ne10*ne11 - i11*ne10)/qk;
-    const int dst_offset = i10*nb10 + i11*nb11 + i12*nb12;
-
-    cpy_blck(cx + x_offset, cdst + dst_offset);
-}
-
-static float rope_yarn_ramp(const float low, const float high, const int i0) {
-    const float y = (i0 / 2 - low) / sycl::max(0.001f, high - low);
-    return 1.0f - sycl::min(1.0f, sycl::max(0.0f, y));
-}
-
-struct rope_corr_dims {
-    float v[4];
-};
-
-// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn
-// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng.
-static void rope_yarn(
-    float theta_extrap, float freq_scale, rope_corr_dims corr_dims, int64_t i0, float ext_factor, float mscale,
-    float * cos_theta, float * sin_theta
-) {
-    // Get n-d rotational scaling corrected for extrapolation
-    float theta_interp = freq_scale * theta_extrap;
-    float theta = theta_interp;
-    if (ext_factor != 0.0f) {
-        float ramp_mix = rope_yarn_ramp(corr_dims.v[0], corr_dims.v[1], i0) * ext_factor;
-        theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix;
-
-        // Get n-d magnitude scaling corrected for interpolation
-        mscale *= 1.0f + 0.1f * sycl::log(1.0f / freq_scale);
-    }
-    *cos_theta = sycl::cos(theta) * mscale;
-    *sin_theta = sycl::sin(theta) * mscale;
-}
-
-// rope == RoPE == rotary positional embedding
-template<typename T, bool has_pos>
-static void rope(
-    const T * x, T * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims
-,
-    const sycl::nd_item<3> &item_ct1) {
-    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                         item_ct1.get_local_id(1));
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base = p * dpct::pow(freq_base, -float(col) / ncols);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, col, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + 1];
-
-    dst[i + 0] = x0*cos_theta - x1*sin_theta;
-    dst[i + 1] = x0*sin_theta + x1*cos_theta;
-}
-
-template<typename T, bool has_pos>
-static void rope_neox(
-    const T * x, T * dst, int ncols, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows,
-    float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, float inv_ndims
-,
-    const sycl::nd_item<3> &item_ct1) {
-    const int col = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                         item_ct1.get_local_id(1));
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int ib = col / n_dims;
-    const int ic = col % n_dims;
-
-    const int i = row*ncols + ib*n_dims + ic/2;
-    const int i2 = row/p_delta_rows;
-
-    float cur_rot = inv_ndims * ic - ib;
-
-    const int p = has_pos ? pos[i2] : 0;
-    const float theta_base =
-        p * freq_scale * dpct::pow(theta_scale, col / 2.0f);
-
-    float cos_theta, sin_theta;
-    rope_yarn(theta_base, freq_scale, corr_dims, cur_rot, ext_factor, attn_factor, &cos_theta, &sin_theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + n_dims/2];
-
-    dst[i + 0]        = x0*cos_theta - x1*sin_theta;
-    dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta;
-}
-
-static void rope_glm_f32(
-    const float * x, float * dst, int ncols, const int32_t * pos, float freq_scale, int p_delta_rows, float freq_base,
-    int n_ctx
-, const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-    const int half_n_dims = ncols/4;
-
-    if (col >= half_n_dims) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-    const int i2 = row/p_delta_rows;
-
-    const float col_theta_scale = dpct::pow(freq_base, -2.0f * col / ncols);
-     // FIXME: this is likely wrong
-    const int p = pos != nullptr ? pos[i2] : 0;
-
-    const float theta = sycl::min(p, n_ctx - 2) * freq_scale * col_theta_scale;
-    const float sin_theta = sycl::sin((float)theta);
-    const float cos_theta = sycl::cos((float)theta);
-
-    const float x0 = x[i + 0];
-    const float x1 = x[i + half_n_dims];
-
-    dst[i + 0]           = x0*cos_theta - x1*sin_theta;
-    dst[i + half_n_dims] = x0*sin_theta + x1*cos_theta;
-
-    const float block_theta =
-        ((float)sycl::max(p - n_ctx - 2, 0)) * col_theta_scale;
-    const float sin_block_theta = sycl::sin((float)block_theta);
-    const float cos_block_theta = sycl::cos((float)block_theta);
-
-    const float x2 = x[i + half_n_dims * 2];
-    const float x3 = x[i + half_n_dims * 3];
-
-    dst[i + half_n_dims * 2] = x2*cos_block_theta - x3*sin_block_theta;
-    dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
-}
-
-static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                                 const int n_heads_log2_floor, const float m0, const float m1,
-                                 const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int i = row*ncols + col;
-
-    const int k = row/k_rows;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = dpct::pow(m0, k + 1);
-    } else {
-        m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    dst[i] = col * m_k + x[i];
-}
-
-static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
-                           const sycl::nd_item<3> &item_ct1) {
-    const int row = item_ct1.get_group(1);
-    const int col = item_ct1.get_local_id(2);
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += item_ct1.get_local_range(2)) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum, item_ct1);
-
-    if (col == 0) {
-        dst[row] = sum;
-    }
-}
-
-template<typename T>
-static inline void swap(T & a, T & b) {
-    T tmp = a;
-    a = b;
-    b = tmp;
-}
-
-template<ggml_sort_order order>
-static void k_argsort_f32_i32(const float * x, int * dst, const int ncols,
-                              const sycl::nd_item<3> &item_ct1) {
-    // bitonic sort
-    int col = item_ct1.get_local_id(2);
-    int row = item_ct1.get_group(1);
-
-    if (col >= ncols) return;
-
-    const float * x_row = x + row * ncols;
-    int * dst_row = dst + row * ncols;
-
-    // initialize indices
-    if (col < ncols) {
-        dst_row[col] = col;
-    }
-    /*
-    DPCT1065:97: Consider replacing sycl::nd_item::barrier() with
-    sycl::nd_item::barrier(sycl::access::fence_space::local_space) for better
-    performance if there is no access to global memory.
-    */
-    item_ct1.barrier();
-
-    for (int k = 2; k <= ncols; k *= 2) {
-        for (int j = k / 2; j > 0; j /= 2) {
-            int ixj = col ^ j;
-            if (ixj > col) {
-                if ((col & k) == 0) {
-                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] > x_row[dst_row[ixj]] : x_row[dst_row[col]] < x_row[dst_row[ixj]]) {
-                        swap(dst_row[col], dst_row[ixj]);
-                    }
-                } else {
-                    if (order == GGML_SORT_ASC ? x_row[dst_row[col]] < x_row[dst_row[ixj]] : x_row[dst_row[col]] > x_row[dst_row[ixj]]) {
-                        swap(dst_row[col], dst_row[ixj]);
-                    }
-                }
-            }
-            /*
-            DPCT1118:26: SYCL group functions and algorithms must be encountered
-            in converged control flow. You may need to adjust the code.
-            */
-            /*
-            DPCT1065:98: Consider replacing sycl::nd_item::barrier() with
-            sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-            better performance if there is no access to global memory.
-            */
-            item_ct1.barrier();
-        }
-    }
-}
-
-static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, const int rows_per_channel, const int n_past,
-                              const sycl::nd_item<3> &item_ct1) {
-    const int col = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
-                    item_ct1.get_local_id(1);
-    const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                    item_ct1.get_local_id(2);
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int i = row*ncols + col;
-    //dst[i] = col > (n_past + row % rows_per_channel) ? -INFINITY : x[i];
-    //dst[i] = x[i] - (col > n_past + row % rows_per_channel) * INT_MAX; // equivalent within rounding error but slightly faster on GPU
-    dst[i] = x[i] - (col > n_past + row % rows_per_channel) * FLT_MAX;
-}
-
-static void soft_max_f32(const float * x, const float * y, float * dst, const int ncols, const int nrows_y, const float scale,
-                         const sycl::nd_item<3> &item_ct1, float *buf) {
-    const int tid = item_ct1.get_local_id(2);
-    const int rowx = item_ct1.get_group(2);
-    const int rowy = rowx % nrows_y; // broadcast the mask (y) in the row dimension
-
-    const int block_size = item_ct1.get_local_range(2);
-
-    const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
-    const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
-
-    float max_val = -INFINITY;
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const int ix = rowx*ncols + col;
-        const int iy = rowy*ncols + col;
-        max_val = sycl::max(max_val, x[ix] * scale + (y ? y[iy] : 0.0f));
-    }
-
-    // find the max value in the block
-    max_val = warp_reduce_max(max_val, item_ct1);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf[lane_id] = -INFINITY;
-        }
-        /*
-        DPCT1118:27: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:99: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-
-        if (lane_id == 0) {
-            buf[warp_id] = max_val;
-        }
-        /*
-        DPCT1118:28: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:100: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-
-        max_val = buf[lane_id];
-        max_val = warp_reduce_max(max_val, item_ct1);
-    }
-
-    float tmp = 0.f;
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const int ix = rowx*ncols + col;
-        const int iy = rowy*ncols + col;
-        const float val =
-            sycl::native::exp((x[ix] * scale + (y ? y[iy] : 0.0f)) - max_val);
-        tmp += val;
-        dst[ix] = val;
-    }
-
-    // find the sum of exps in the block
-    tmp = warp_reduce_sum(tmp, item_ct1);
-    if (block_size > WARP_SIZE) {
-        if (warp_id == 0) {
-            buf[lane_id] = 0.f;
-        }
-        /*
-        DPCT1118:29: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:101: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-
-        if (lane_id == 0) {
-            buf[warp_id] = tmp;
-        }
-        /*
-        DPCT1118:30: SYCL group functions and algorithms must be encountered in
-        converged control flow. You may need to adjust the code.
-        */
-        /*
-        DPCT1065:102: Consider replacing sycl::nd_item::barrier() with
-        sycl::nd_item::barrier(sycl::access::fence_space::local_space) for
-        better performance if there is no access to global memory.
-        */
-        item_ct1.barrier();
-
-        tmp = buf[lane_id];
-        tmp = warp_reduce_sum(tmp, item_ct1);
-    }
-
-    const float inv_tmp = 1.f / tmp;
-
-    for (int col = tid; col < ncols; col += block_size) {
-        const int i = rowx*ncols + col;
-        dst[i] *= inv_tmp;
-    }
-}
-
-static void scale_f32(const float * x, float * dst, const float scale, const int k,
-                      const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = scale * x[i];
-}
-
-static void clamp_f32(const float * x, float * dst, const float min, const float max, const int k,
-                      const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
-                  item_ct1.get_local_id(2);
-
-    if (i >= k) {
-        return;
-    }
-
-    dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
-}
-
-static void im2col_f32_f16(const float *x, sycl::half *dst, int offset_delta,
-                           int IW, int IH, int OW, int KW, int KH,
-                           int pelements, int CHW, int s0, int s1, int p0,
-                           int p1, int d0, int d1,
-                           const sycl::nd_item<3> &item_ct1) {
-    const int i = item_ct1.get_local_id(2) +
-                  item_ct1.get_group(2) * item_ct1.get_local_range(2);
-    if (i >= pelements) {
-        return;
-    }
-
-    const int ksize = OW * (KH > 1 ? KW : 1);
-    const int kx = i / ksize;
-    const int kd = kx * ksize;
-    const int ky = (i - kd) / OW;
-    const int ix = i % OW;
-
-    const int iiw = ix * s0 + kx * d0 - p0;
-    const int iih = item_ct1.get_group(1) * s1 + ky * d1 - p1;
-
-    const int offset_dst = (item_ct1.get_group(1) * OW + ix) * CHW +
-                           (item_ct1.get_group(0) * (KW * KH) + ky * KW + kx);
-
-    if (iih < 0 || iih >= IH || iiw < 0 || iiw >= IW) {
-        dst[offset_dst] =
-            sycl::vec<float, 1>{0.0f}
-                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
-    } else {
-        const int offset_src = item_ct1.get_group(0) * offset_delta;
-        dst[offset_dst] =
-            sycl::vec<float, 1>{x[offset_src + iih * IW + iiw]}
-                .convert<sycl::half, sycl::rounding_mode::automatic>()[0];
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dq>
-static void get_rows_cuda(const ggml_tensor *src0, const ggml_tensor *src1,
-                          ggml_tensor *dst, const void *src0_dd,
-                          const int32_t *src1_dd, float *dst_dd,
-                          dpct::queue_ptr stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
-    const int block_num_x = (ne00 + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
-    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    GGML_ASSERT(ne00 % 2 == 0);
-
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             k_get_rows<qk, qr, dq>(
-                                 src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
-                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
-                         });
-
-    (void) dst;
-}
-
-template <typename src0_t>
-static void get_rows_cuda_float(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const src0_t *src0_dd, const int32_t *src1_dd,
-                                float *dst_dd, dpct::queue_ptr stream) {
-
-    GGML_TENSOR_BINARY_OP_LOCALS
-
-    const sycl::range<3> block_dims(1, 1, CUDA_GET_ROWS_BLOCK_SIZE);
-    const int block_num_x = (ne00 + CUDA_GET_ROWS_BLOCK_SIZE - 1) / CUDA_GET_ROWS_BLOCK_SIZE;
-    const sycl::range<3> block_nums(ne11 * ne12, ne10, block_num_x);
-
-    // strides in elements
-    //const size_t s0 = nb0 / ggml_element_size(dst);
-    const size_t s1 = nb1 / ggml_element_size(dst);
-    const size_t s2 = nb2 / ggml_element_size(dst);
-    const size_t s3 = nb3 / ggml_element_size(dst);
-
-    const size_t s10 = nb10 / ggml_element_size(src1);
-    const size_t s11 = nb11 / ggml_element_size(src1);
-    const size_t s12 = nb12 / ggml_element_size(src1);
-    //const size_t s13 = nb13 / ggml_element_size(src1);
-
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                k_get_rows_float(src0_dd, src1_dd, dst_dd, ne00, ne12, s1, s2,
-                                 s3, nb01, nb02, nb03, s10, s11, s12, item_ct1);
-            });
-    }
-
-    (void) dst;
-}
-
-template<float (*bin_op)(const float, const float)>
-struct bin_bcast_cuda {
-    template <typename src0_t, typename src1_t, typename dst_t>
-    void operator()(const struct ggml_tensor *src0,
-                    const struct ggml_tensor *src1, struct ggml_tensor *dst,
-                    const src0_t *src0_dd, const src1_t *src1_dd, dst_t *dst_dd,
-                    dpct::queue_ptr stream) {
-
-        GGML_TENSOR_BINARY_OP_LOCALS
-
-        int nr0 = ne10/ne0;
-        int nr1 = ne11/ne1;
-        int nr2 = ne12/ne2;
-        int nr3 = ne13/ne3;
-
-        int nr[4] = { nr0, nr1, nr2, nr3 };
-
-        // collapse dimensions until first broadcast dimension
-        int64_t cne0[] = {ne0, ne1, ne2, ne3};
-        int64_t cne1[] = {ne10, ne11, ne12, ne13};
-        size_t cnb0[] = {nb0, nb1, nb2, nb3};
-        size_t cnb1[] = {nb10, nb11, nb12, nb13};
-        auto collapse = [](int64_t cne[]) {
-            cne[0] *= cne[1];
-            cne[1] = cne[2];
-            cne[2] = cne[3];
-            cne[3] = 1;
-        };
-
-        auto collapse_nb = [](size_t cnb[], int64_t cne[]) {
-            cnb[1] *= cne[1];
-            cnb[2] *= cne[2];
-            cnb[3] *= cne[3];
-        };
-
-        for (int i = 0; i < 4; i++) {
-            if (nr[i] != 1) {
-                break;
-            }
-            if (i > 0) {
-                collapse_nb(cnb0, cne0);
-                collapse_nb(cnb1, cne1);
-                collapse(cne0);
-                collapse(cne1);
-            }
-        }
-        {
-            int64_t ne0 = cne0[0];
-            int64_t ne1 = cne0[1];
-            int64_t ne2 = cne0[2];
-            int64_t ne3 = cne0[3];
-
-            int64_t ne10 = cne1[0];
-            int64_t ne11 = cne1[1];
-            int64_t ne12 = cne1[2];
-            int64_t ne13 = cne1[3];
-
-            size_t nb0 = cnb0[0];
-            size_t nb1 = cnb0[1];
-            size_t nb2 = cnb0[2];
-            size_t nb3 = cnb0[3];
-
-            size_t nb10 = cnb1[0];
-            size_t nb11 = cnb1[1];
-            size_t nb12 = cnb1[2];
-            size_t nb13 = cnb1[3];
-
-            size_t s0 = nb0 / sizeof(dst_t);
-            size_t s1 = nb1 / sizeof(dst_t);
-            size_t s2 = nb2 / sizeof(dst_t);
-            size_t s3 = nb3 / sizeof(dst_t);
-
-            size_t s10 = nb10 / sizeof(src1_t);
-            size_t s11 = nb11 / sizeof(src1_t);
-            size_t s12 = nb12 / sizeof(src1_t);
-            size_t s13 = nb13 / sizeof(src1_t);
-
-            GGML_ASSERT(s0 == 1);
-            GGML_ASSERT(s10 == 1);
-
-            const int block_size = 128;
-
-            int64_t hne0 = std::max(ne0/2LL, 1LL);
-
-            sycl::range<3> block_dims(1, 1, 1);
-            block_dims[2] = std::min<unsigned int>(hne0, block_size);
-            block_dims[1] =
-                std::min<unsigned int>(ne1, block_size / block_dims[2]);
-            block_dims[0] = std::min(
-                std::min<unsigned int>(ne2 * ne3, block_size / block_dims[2] /
-                                                      block_dims[1]),
-                64U);
-
-            sycl::range<3> block_nums(
-                (ne2 * ne3 + block_dims[0] - 1) / block_dims[0],
-                (ne1 + block_dims[1] - 1) / block_dims[1],
-                (hne0 + block_dims[2] - 1) / block_dims[2]);
-
-            if (block_nums[0] > 65535) {
-                // this is the maximum number of blocks in z direction, fallback to 1D grid kernel
-                int block_num = (ne0*ne1*ne2*ne3 + block_size - 1) / block_size;
-                {
-                    dpct::has_capability_or_fail(stream->get_device(),
-                                                 {sycl::aspect::fp16});
-                    stream->parallel_for(
-                        sycl::nd_range<3>(sycl::range<3>(1, 1, block_num) *
-                                              sycl::range<3>(1, 1, block_size),
-                                          sycl::range<3>(1, 1, block_size)),
-                        [=](sycl::nd_item<3> item_ct1) {
-                            k_bin_bcast_unravel<bin_op>(
-                                src0_dd, src1_dd, dst_dd, ne0, ne1, ne2, ne3,
-                                ne10, ne11, ne12, ne13, s1, s2, s3, s11, s12,
-                                s13, item_ct1);
-                        });
-                }
-            } else {
-                /*
-                DPCT1049:31: The work-group size passed to the SYCL kernel may
-                exceed the limit. To get the device limit, query
-                info::device::max_work_group_size. Adjust the work-group size if
-                needed.
-                */
-                dpct::has_capability_or_fail(stream->get_device(),
-                                             {sycl::aspect::fp16});
-                stream->parallel_for(
-                    sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                    [=](sycl::nd_item<3> item_ct1) {
-                        k_bin_bcast<bin_op>(src0_dd, src1_dd, dst_dd, ne0, ne1,
-                                            ne2, ne3, ne10, ne11, ne12, ne13,
-                                            s1, s2, s3, s11, s12, s13,
-                                            item_ct1);
-                    });
-            }
-        }
-    }
-};
-
-static void acc_f32_cuda(const float *x, const float *y, float *dst,
-                         const int n_elements, const int ne10, const int ne11,
-                         const int ne12, const int nb1, const int nb2,
-                         const int offset, dpct::queue_ptr stream) {
-    int num_blocks = (n_elements + CUDA_ACC_BLOCK_SIZE - 1) / CUDA_ACC_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_ACC_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            acc_f32(x, y, dst, n_elements, ne10, ne11, ne12, nb1, nb2, offset,
-                    item_ct1);
-        });
-}
-
-static void gelu_f32_cuda(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            gelu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void silu_f32_cuda(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_SILU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            silu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void gelu_quick_f32_cuda(const float *x, float *dst, const int k,
-                                dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_GELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            gelu_quick_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void tanh_f32_cuda(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_TANH_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            tanh_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void relu_f32_cuda(const float *x, float *dst, const int k,
-                          dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            relu_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void leaky_relu_f32_cuda(const float *x, float *dst, const int k,
-                                const float negative_slope,
-                                dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_RELU_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            leaky_relu_f32(x, dst, k, negative_slope, item_ct1);
-        });
-}
-
-static void sqr_f32_cuda(const float *x, float *dst, const int k,
-                         dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_SQR_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            sqr_f32(x, dst, k, item_ct1);
-        });
-}
-
-static void norm_f32_cuda(const float *x, float *dst, const int ncols,
-                          const int nrows, const float eps,
-                          dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(32), cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
-                                            s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    } else {
-        const sycl::range<3> block_dims(1, 1, 1024);
-        /*
-        DPCT1049:32: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<sycl::float2, 1> s_sum_acc_ct1(
-                sycl::range<1>(32), cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        norm_f32<1024>(x, dst, ncols, eps, item_ct1,
-                                       s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    }
-}
-
-static void group_norm_f32_cuda(const float *x, float *dst,
-                                const int num_groups, const int group_size,
-                                const int ne_elements, dpct::queue_ptr stream) {
-    static const float eps = 1e-6f;
-    if (group_size < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            const float eps_ct4 = eps;
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32<WARP_SIZE>(
-                            x, dst, group_size, ne_elements, eps_ct4, item_ct1,
-                            s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    } else {
-        const sycl::range<3> block_dims(1, 1, 1024);
-        /*
-        DPCT1049:33: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            const float eps_ct4 = eps;
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, num_groups) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        group_norm_f32<1024>(x, dst, group_size, ne_elements,
-                                             eps_ct4, item_ct1,
-                                             s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    }
-}
-
-static void concat_f32_cuda(const float *x, const float *y, float *dst,
-                            const int ne0, int ne1, int ne2, int ne02,
-                            dpct::queue_ptr stream) {
-    int num_blocks = (ne0 + CUDA_CONCAT_BLOCK_SIZE - 1) / CUDA_CONCAT_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_CONCAT_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            concat_f32(x, y, dst, ne0, ne02, item_ct1);
-        });
-}
-
-static void upscale_f32_cuda(const float *x, float *dst, const int ne00,
-                             const int ne01, const int ne02,
-                             const int scale_factor, dpct::queue_ptr stream) {
-    int ne0 = (ne00 * scale_factor);
-    int num_blocks = (ne0 + CUDA_UPSCALE_BLOCK_SIZE - 1) / CUDA_UPSCALE_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne02, (ne01 * scale_factor), num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim *
-                              sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_UPSCALE_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            upscale_f32(x, dst, ne00, ne00 * ne01, scale_factor, item_ct1);
-        });
-}
-
-static void pad_f32_cuda(const float *x, float *dst, const int ne00,
-                         const int ne01, const int ne02, const int ne0,
-                         const int ne1, const int ne2, dpct::queue_ptr stream) {
-    int num_blocks = (ne0 + CUDA_PAD_BLOCK_SIZE - 1) / CUDA_PAD_BLOCK_SIZE;
-    sycl::range<3> gridDim(ne2, ne1, num_blocks);
-    stream->parallel_for(
-        sycl::nd_range<3>(gridDim * sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_PAD_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            pad_f32(x, dst, ne0, ne00, ne01, ne02, item_ct1);
-        });
-}
-
-static void rms_norm_f32_cuda(const float *x, float *dst, const int ncols,
-                              const int nrows, const float eps,
-                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % WARP_SIZE == 0);
-    if (ncols < 1024) {
-        const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32<WARP_SIZE>(x, dst, ncols, eps, item_ct1,
-                                                s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    } else {
-        const sycl::range<3> block_dims(1, 1, 1024);
-        /*
-        DPCT1049:34: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->submit([&](sycl::handler &cgh) {
-            sycl::local_accessor<float, 1> s_sum_acc_ct1(sycl::range<1>(32),
-                                                         cgh);
-
-            cgh.parallel_for(
-                sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims,
-                                  block_dims),
-                [=](sycl::nd_item<3> item_ct1)
-                    [[intel::reqd_sub_group_size(32)]] {
-                        rms_norm_f32<1024>(x, dst, ncols, eps, item_ct1,
-                                           s_sum_acc_ct1.get_pointer());
-                    });
-        });
-    }
-}
-
-static void quantize_row_q8_1_cuda(const float *x, void *vy, const int kx,
-                                   const int ky, const int kx_padded,
-                                   dpct::queue_ptr stream) {
-    const int block_num_x = (kx_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE;
-    const sycl::range<3> num_blocks(1, ky, block_num_x);
-    const sycl::range<3> block_size(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(num_blocks * block_size, block_size),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                quantize_q8_1(x, vy, kx, kx_padded, item_ct1);
-            });
-    }
-}
-
-template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
-static void dequantize_block_cuda(const void *__restrict__ vx,
-                                  dst_t *__restrict__ y, const int k,
-                                  dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(
-                sycl::range<3>(1, 1, num_blocks) *
-                    sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE),
-                sycl::range<3>(1, 1, CUDA_DEQUANTIZE_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                dequantize_block<qk, qr, dequantize_kernel>(vx, y, k, item_ct1);
-            });
-    }
-}
-
-template <typename dst_t>
-static void dequantize_row_q2_K_cuda(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q2_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q2_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template <typename dst_t>
-static void dequantize_row_q3_K_cuda(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q3_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q3_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template <typename dst_t>
-static void dequantize_row_q4_K_cuda(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 32),
-                                               sycl::range<3>(1, 1, 32)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q4_K(vx, y, item_ct1);
-                             });
-    }
-}
-
-template <typename dst_t>
-static void dequantize_row_q5_K_cuda(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q5_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q5_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-template <typename dst_t>
-static void dequantize_row_q6_K_cuda(const void *vx, dst_t *y, const int k,
-                                     dpct::queue_ptr stream) {
-    const int nb = k / QK_K;
-#if QK_K == 256
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, nb) *
-                                                   sycl::range<3>(1, 1, 64),
-                                               sycl::range<3>(1, 1, 64)),
-                             [=](sycl::nd_item<3> item_ct1) {
-                                 dequantize_block_q6_K(vx, y, item_ct1);
-                             });
-    }
-#else
-    dequantize_block_q6_K<<<nb, 32, 0, stream>>>(vx, y);
-#endif
-}
-
-static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
-        case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
-        case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
-        case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F32:
-            return dequantize_block_cuda<1, 1, convert_f32>;
-        default:
-            return nullptr;
-    }
-}
-
-static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
-    switch (type) {
-        case GGML_TYPE_Q4_0:
-            return dequantize_block_cuda<QK4_0, QR4_0, dequantize_q4_0>;
-        case GGML_TYPE_Q4_1:
-            return dequantize_block_cuda<QK4_1, QR4_1, dequantize_q4_1>;
-        case GGML_TYPE_Q5_0:
-            return dequantize_block_cuda<QK5_0, QR5_0, dequantize_q5_0>;
-        case GGML_TYPE_Q5_1:
-            return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
-        case GGML_TYPE_Q8_0:
-            return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
-        case GGML_TYPE_Q2_K:
-            return dequantize_row_q2_K_cuda;
-        case GGML_TYPE_Q3_K:
-            return dequantize_row_q3_K_cuda;
-        case GGML_TYPE_Q4_K:
-            return dequantize_row_q4_K_cuda;
-        case GGML_TYPE_Q5_K:
-            return dequantize_row_q5_K_cuda;
-        case GGML_TYPE_Q6_K:
-            return dequantize_row_q6_K_cuda;
-        case GGML_TYPE_F16:
-            return dequantize_block_cuda<1, 1, convert_f16>;
-        default:
-            return nullptr;
-    }
-}
-
-static void dequantize_mul_mat_vec_q4_0_cuda(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    // the number of rows may exceed maximum grid size in the y or z dimensions, use the x dimension instead
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK4_0, QR4_0, dequantize_q4_0>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q4_1_cuda(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK4_1, QR4_1, dequantize_q4_1>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q5_0_cuda(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK5_0, QR5_0, dequantize_q5_0>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q5_1_cuda(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK5_1, QR5_1, dequantize_q5_1>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q8_0_cuda(const void *vx, const dfloat *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<QK8_0, QR8_0, dequantize_q8_0>(
-                    vx, y, dst, ncols, nrows, item_ct1);
-            });
-    }
-}
-
-static void dequantize_mul_mat_vec_q2_K_cuda(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q3_K_cuda(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q4_K_cuda(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q5_K_cuda(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const sycl::range<3> block_dims(1, 1, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
-        });
-}
-
-static void dequantize_mul_mat_vec_q6_K_cuda(const void *vx, const float *y,
-                                             float *dst, const int ncols,
-                                             const int nrows,
-                                             dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int ny = 2 / K_QUANTS_PER_ITERATION;
-    const int block_num_y = (nrows + ny - 1) / ny;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, 32);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
-        });
-}
-
-static void convert_mul_mat_vec_f16_cuda(const void *vx, const dfloat *y,
-                                         float *dst, const int ncols,
-                                         const int nrows,
-                                         dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % GGML_CUDA_DMMV_X == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                dequantize_mul_mat_vec<1, 1, convert_f16>(vx, y, dst, ncols,
-                                                          nrows, item_ct1);
-            });
-    }
-}
-
-static void mul_mat_vec_q4_0_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK4_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK4_0, QI4_0, block_q4_0, VDR_Q4_0_Q8_1_MMVQ,
-                          vec_dot_q4_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q4_1_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK4_1 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK4_0, QI4_1, block_q4_1, VDR_Q4_1_Q8_1_MMVQ,
-                          vec_dot_q4_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_0_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK5_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK5_0, QI5_0, block_q5_0, VDR_Q5_0_Q8_1_MMVQ,
-                          vec_dot_q5_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_1_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK5_1 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK5_1, QI5_1, block_q5_1, VDR_Q5_1_Q8_1_MMVQ,
-                          vec_dot_q5_1_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q8_0_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK8_0 == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK8_0, QI8_0, block_q8_0, VDR_Q8_0_Q8_1_MMVQ,
-                          vec_dot_q8_0_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q2_K_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI2_K, block_q2_K, VDR_Q2_K_Q8_1_MMVQ,
-                          vec_dot_q2_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q3_K_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI3_K, block_q3_K, VDR_Q3_K_Q8_1_MMVQ,
-                          vec_dot_q3_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q4_K_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI4_K, block_q4_K, VDR_Q4_K_Q8_1_MMVQ,
-                          vec_dot_q4_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q5_K_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI5_K, block_q5_K, VDR_Q5_K_Q8_1_MMVQ,
-                          vec_dot_q5_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void mul_mat_vec_q6_K_q8_1_cuda(const void *vx, const void *vy,
-                                       float *dst, const int ncols,
-                                       const int nrows,
-                                       dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % QK_K == 0);
-    const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y;
-    const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, GGML_CUDA_MMV_Y, WARP_SIZE);
-    stream->parallel_for(
-        sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-            mul_mat_vec_q<QK_K, QI6_K, block_q6_K, VDR_Q6_K_Q8_1_MMVQ,
-                          vec_dot_q6_K_q8_1>(vx, vy, dst, ncols, nrows,
-                                             item_ct1);
-        });
-}
-
-static void ggml_mul_mat_q4_0_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-    const int compute_capability = g_compute_capabilities[id];
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_0_RDNA2;
-        nwarps = NWARPS_Q4_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_0_RDNA1;
-        nwarps = NWARPS_Q4_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_0_AMPERE;
-        nwarps = NWARPS_Q4_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_0_PASCAL;
-        nwarps = NWARPS_Q4_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:35: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:36: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q4_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q4_1_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-    const int compute_capability = g_compute_capabilities[id];
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_1_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_1_RDNA2;
-        nwarps = NWARPS_Q4_1_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_1_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_1_RDNA1;
-        nwarps = NWARPS_Q4_1_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_1_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_1_AMPERE;
-        nwarps = NWARPS_Q4_1_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_1_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_1_PASCAL;
-        nwarps = NWARPS_Q4_1_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:37: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:38: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q4_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q5_0_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-    const int compute_capability = g_compute_capabilities[id];
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_0_RDNA2;
-        nwarps = NWARPS_Q5_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_0_RDNA1;
-        nwarps = NWARPS_Q5_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_0_AMPERE;
-        nwarps = NWARPS_Q5_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_0_PASCAL;
-        nwarps = NWARPS_Q5_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:39: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:40: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q5_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q5_1_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-    const int compute_capability = g_compute_capabilities[id];
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_1_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_1_RDNA2;
-        nwarps = NWARPS_Q5_1_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_1_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_1_RDNA1;
-        nwarps = NWARPS_Q5_1_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_1_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_1_AMPERE;
-        nwarps = NWARPS_Q5_1_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_1_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_1_PASCAL;
-        nwarps = NWARPS_Q5_1_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:41: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:42: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q5_1<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q8_0_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-    const int compute_capability = g_compute_capabilities[id];
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q8_0_RDNA2;
-        mmq_y  =  MMQ_Y_Q8_0_RDNA2;
-        nwarps = NWARPS_Q8_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q8_0_RDNA1;
-        mmq_y  =  MMQ_Y_Q8_0_RDNA1;
-        nwarps = NWARPS_Q8_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q8_0_AMPERE;
-        mmq_y  =  MMQ_Y_Q8_0_AMPERE;
-        nwarps = NWARPS_Q8_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q8_0_PASCAL;
-        mmq_y  =  MMQ_Y_Q8_0_PASCAL;
-        nwarps = NWARPS_Q8_0_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:43: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:44: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q8_0<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q2_K_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-    const int compute_capability = g_compute_capabilities[id];
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q2_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q2_K_RDNA2;
-        nwarps = NWARPS_Q2_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q2_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q2_K_RDNA1;
-        nwarps = NWARPS_Q2_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q2_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q2_K_AMPERE;
-        nwarps = NWARPS_Q2_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q2_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q2_K_PASCAL;
-        nwarps = NWARPS_Q2_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:45: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:46: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q2_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q3_K_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-#if QK_K == 256
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-    const int compute_capability = g_compute_capabilities[id];
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q3_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q3_K_RDNA2;
-        nwarps = NWARPS_Q3_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q3_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q3_K_RDNA1;
-        nwarps = NWARPS_Q3_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q3_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q3_K_AMPERE;
-        nwarps = NWARPS_Q3_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q3_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q3_K_PASCAL;
-        nwarps = NWARPS_Q3_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:47: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:48: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q3_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    }
-#endif
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q4_K_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-    const int compute_capability = g_compute_capabilities[id];
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q4_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q4_K_RDNA2;
-        nwarps = NWARPS_Q4_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q4_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q4_K_RDNA1;
-        nwarps = NWARPS_Q4_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q4_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q4_K_AMPERE;
-        nwarps = NWARPS_Q4_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q4_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q4_K_PASCAL;
-        nwarps = NWARPS_Q4_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:49: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:50: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q4_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q5_K_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-    const int compute_capability = g_compute_capabilities[id];
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q5_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q5_K_RDNA2;
-        nwarps = NWARPS_Q5_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q5_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q5_K_RDNA1;
-        nwarps = NWARPS_Q5_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q5_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q5_K_AMPERE;
-        nwarps = NWARPS_Q5_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q5_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q5_K_PASCAL;
-        nwarps = NWARPS_Q5_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:51: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:52: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q5_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_q6_K_q8_1_cuda(const void *vx, const void *vy,
-                                        float *dst, const int ncols_x,
-                                        const int nrows_x, const int ncols_y,
-                                        const int nrows_y, const int nrows_dst,
-                                        dpct::queue_ptr stream) try {
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-    const int compute_capability = g_compute_capabilities[id];
-
-    int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
-        mmq_x  =  MMQ_X_Q6_K_RDNA2;
-        mmq_y  =  MMQ_Y_Q6_K_RDNA2;
-        nwarps = NWARPS_Q6_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
-        mmq_x  =  MMQ_X_Q6_K_RDNA1;
-        mmq_y  =  MMQ_Y_Q6_K_RDNA1;
-        nwarps = NWARPS_Q6_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
-        mmq_x  =  MMQ_X_Q6_K_AMPERE;
-        mmq_y  =  MMQ_Y_Q6_K_AMPERE;
-        nwarps = NWARPS_Q6_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
-        mmq_x  =  MMQ_X_Q6_K_PASCAL;
-        mmq_y  =  MMQ_Y_Q6_K_PASCAL;
-        nwarps = NWARPS_Q6_K_PASCAL;
-    } else {
-        GGML_ASSERT(false);
-    }
-
-    const int block_num_x = (nrows_x + mmq_y - 1) / mmq_y;
-    const int block_num_y = (ncols_y + mmq_x - 1) / mmq_x;
-    const sycl::range<3> block_nums(1, block_num_y, block_num_x);
-    const sycl::range<3> block_dims(1, nwarps, WARP_SIZE);
-
-    if (nrows_x % mmq_y == 0) {
-        const bool need_check = false;
-        /*
-        DPCT1049:53: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    } else {
-        const bool need_check = true;
-        /*
-        DPCT1049:54: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                mul_mat_q6_K<need_check>(vx, vy, dst, ncols_x, nrows_x, ncols_y,
-                                         nrows_y, nrows_dst);
-            });
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_mul_mat_p021_f16_f32_cuda(const void *vx, const float *y,
-                                           float *dst, const int ncols_x,
-                                           const int nrows_x,
-                                           const int nchannels_x,
-                                           const int nchannels_y,
-                                           dpct::queue_ptr stream) {
-
-    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
-    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
-                                     nchannels_y, item_ct1);
-            });
-    }
-}
-
-static void ggml_mul_mat_vec_nc_f16_f32_cuda(
-    const void *vx, const float *y, float *dst, const int ncols_x,
-    const int nrows_x, const int row_stride_x, const int nchannels_x,
-    const int nchannels_y, const int channel_stride_x, dpct::queue_ptr stream) {
-
-    const sycl::range<3> block_nums(nchannels_y, nrows_x, 1);
-    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
-                                       row_stride_x, channel_stride_x,
-                                       nchannels_y / nchannels_x, item_ct1);
-            });
-    }
-}
-
-static void ggml_cpy_f32_f32_cuda(const char *cx, char *cdst, const int ne,
-                                  const int ne00, const int ne01,
-                                  const int nb00, const int nb01,
-                                  const int nb02, const int ne10,
-                                  const int ne11, const int nb10,
-                                  const int nb11, const int nb12,
-                                  dpct::queue_ptr stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                cpy_f32_f16<cpy_1_f32_f32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
-                                           nb02, ne10, ne11, nb10, nb11, nb12,
-                                           item_ct1);
-            });
-    }
-}
-
-static void ggml_cpy_f32_f16_cuda(const char *cx, char *cdst, const int ne,
-                                  const int ne00, const int ne01,
-                                  const int nb00, const int nb01,
-                                  const int nb02, const int ne10,
-                                  const int ne11, const int nb10,
-                                  const int nb11, const int nb12,
-                                  dpct::queue_ptr stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                cpy_f32_f16<cpy_1_f32_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
-                                           nb02, ne10, ne11, nb10, nb11, nb12,
-                                           item_ct1);
-            });
-    }
-}
-
-static void ggml_cpy_f32_q8_0_cuda(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int nb00, const int nb01,
-                                   const int nb02, const int ne10,
-                                   const int ne11, const int nb10,
-                                   const int nb11, const int nb12,
-                                   dpct::queue_ptr stream) {
-
-    GGML_ASSERT(ne % QK8_0 == 0);
-    const int num_blocks = ne / QK8_0;
-    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                           sycl::range<3>(1, 1, 1)),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>(
-                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
-                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
-                         });
-}
-
-static void ggml_cpy_f32_q4_0_cuda(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int nb00, const int nb01,
-                                   const int nb02, const int ne10,
-                                   const int ne11, const int nb10,
-                                   const int nb11, const int nb12,
-                                   dpct::queue_ptr stream) {
-
-    GGML_ASSERT(ne % QK4_0 == 0);
-    const int num_blocks = ne / QK4_0;
-    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                           sycl::range<3>(1, 1, 1)),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>(
-                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
-                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
-                         });
-}
-
-static void ggml_cpy_f32_q4_1_cuda(const char *cx, char *cdst, const int ne,
-                                   const int ne00, const int ne01,
-                                   const int nb00, const int nb01,
-                                   const int nb02, const int ne10,
-                                   const int ne11, const int nb10,
-                                   const int nb11, const int nb12,
-                                   dpct::queue_ptr stream) {
-
-    GGML_ASSERT(ne % QK4_1 == 0);
-    const int num_blocks = ne / QK4_1;
-    stream->parallel_for(sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks),
-                                           sycl::range<3>(1, 1, 1)),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>(
-                                 cx, cdst, ne, ne00, ne01, nb00, nb01, nb02,
-                                 ne10, ne11, nb10, nb11, nb12, item_ct1);
-                         });
-}
-
-static void ggml_cpy_f16_f16_cuda(const char *cx, char *cdst, const int ne,
-                                  const int ne00, const int ne01,
-                                  const int nb00, const int nb01,
-                                  const int nb02, const int ne10,
-                                  const int ne11, const int nb10,
-                                  const int nb11, const int nb12,
-                                  dpct::queue_ptr stream) {
-
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                                  sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_CPY_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                cpy_f32_f16<cpy_1_f16_f16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
-                                           nb02, ne10, ne11, nb10, nb11, nb12,
-                                           item_ct1);
-            });
-    }
-}
-
-static void scale_f32_cuda(const float *x, float *dst, const float scale,
-                           const int k, dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_SCALE_BLOCK_SIZE - 1) / CUDA_SCALE_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_SCALE_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            scale_f32(x, dst, scale, k, item_ct1);
-        });
-}
-
-static void clamp_f32_cuda(const float *x, float *dst, const float min,
-                           const float max, const int k,
-                           dpct::queue_ptr stream) {
-    const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
-    stream->parallel_for(
-        sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
-                              sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE),
-                          sycl::range<3>(1, 1, CUDA_CLAMP_BLOCK_SIZE)),
-        [=](sycl::nd_item<3> item_ct1) {
-            clamp_f32(x, dst, min, max, k, item_ct1);
-        });
-}
-
-template <typename T>
-static void rope_cuda(const T *x, T *dst, int ncols, int nrows,
-                      const int32_t *pos, float freq_scale, int p_delta_rows,
-                      float freq_base, float ext_factor, float attn_factor,
-                      rope_corr_dims corr_dims, dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
-    if (pos == nullptr) {
-        /*
-        DPCT1049:55: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope<T, false>(x, dst, ncols, pos, freq_scale, p_delta_rows,
-                               freq_base, ext_factor, attn_factor, corr_dims,
-                               item_ct1);
-            });
-    } else {
-        /*
-        DPCT1049:56: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope<T, true>(x, dst, ncols, pos, freq_scale, p_delta_rows,
-                              freq_base, ext_factor, attn_factor, corr_dims,
-                              item_ct1);
-            });
-    }
-}
-
-template <typename T>
-static void rope_neox_cuda(const T *x, T *dst, int ncols, int n_dims, int nrows,
-                           const int32_t *pos, float freq_scale,
-                           int p_delta_rows, float freq_base, float ext_factor,
-                           float attn_factor, rope_corr_dims corr_dims,
-                           dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % 2 == 0);
-    const sycl::range<3> block_dims(1, CUDA_ROPE_BLOCK_SIZE, 1);
-    const int num_blocks_x = (ncols + 2*CUDA_ROPE_BLOCK_SIZE - 1) / (2*CUDA_ROPE_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, num_blocks_x, nrows);
-
-    const float theta_scale = powf(freq_base, -2.0f/n_dims);
-    const float inv_ndims = -1.0f / n_dims;
-
-    if (pos == nullptr) {
-        /*
-        DPCT1049:57: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope_neox<T, false>(x, dst, ncols, n_dims, pos, freq_scale,
-                                    p_delta_rows, ext_factor, attn_factor,
-                                    corr_dims, theta_scale, inv_ndims,
-                                    item_ct1);
-            });
-    } else {
-        /*
-        DPCT1049:58: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                rope_neox<T, true>(x, dst, ncols, n_dims, pos, freq_scale,
-                                   p_delta_rows, ext_factor, attn_factor,
-                                   corr_dims, theta_scale, inv_ndims, item_ct1);
-            });
-    }
-}
-
-static void rope_glm_f32_cuda(const float *x, float *dst, int ncols, int nrows,
-                              const int32_t *pos, float freq_scale,
-                              int p_delta_rows, float freq_base, int n_ctx,
-                              dpct::queue_ptr stream) {
-    GGML_ASSERT(ncols % 4 == 0);
-    const sycl::range<3> block_dims(1, 1, CUDA_ROPE_BLOCK_SIZE / 4);
-    const int num_blocks_x = (ncols + CUDA_ROPE_BLOCK_SIZE - 1) / CUDA_ROPE_BLOCK_SIZE;
-    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             rope_glm_f32(x, dst, ncols, pos, freq_scale,
-                                          p_delta_rows, freq_base, n_ctx,
-                                          item_ct1);
-                         });
-}
-
-static void alibi_f32_cuda(const float *x, float *dst, const int ncols,
-                           const int nrows, const int k_rows,
-                           const int n_heads_log2_floor, const float m0,
-                           const float m1, dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, 1, CUDA_ALIBI_BLOCK_SIZE);
-    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
-    const sycl::range<3> block_nums(1, nrows, num_blocks_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             alibi_f32(x, dst, ncols, k_rows,
-                                       n_heads_log2_floor, m0, m1, item_ct1);
-                         });
-}
-
-static void sum_rows_f32_cuda(const float *x, float *dst, const int ncols,
-                              const int nrows, dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
-    const sycl::range<3> block_nums(1, nrows, 1);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1)
-                             [[intel::reqd_sub_group_size(32)]] {
-                                 k_sum_rows_f32(x, dst, ncols, item_ct1);
-                             });
-}
-
-static void argsort_f32_i32_cuda(const float *x, int *dst, const int ncols,
-                                 const int nrows, ggml_sort_order order,
-                                 dpct::queue_ptr stream) {
-    // bitonic sort requires ncols to be power of 2
-    GGML_ASSERT((ncols & (ncols - 1)) == 0);
-
-    const sycl::range<3> block_dims(1, 1, ncols);
-    const sycl::range<3> block_nums(1, nrows, 1);
-    if (order == GGML_SORT_ASC) {
-        /*
-        DPCT1049:59: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                k_argsort_f32_i32<GGML_SORT_ASC>(x, dst, ncols, item_ct1);
-            });
-    } else if (order == GGML_SORT_DESC) {
-        /*
-        DPCT1049:60: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) {
-                k_argsort_f32_i32<GGML_SORT_DESC>(x, dst, ncols, item_ct1);
-            });
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-static void diag_mask_inf_f32_cuda(const float *x, float *dst,
-                                   const int ncols_x, const int nrows_x,
-                                   const int rows_per_channel, const int n_past,
-                                   dpct::queue_ptr stream) {
-    const sycl::range<3> block_dims(1, CUDA_DIAG_MASK_INF_BLOCK_SIZE, 1);
-    const int block_num_x = (ncols_x + CUDA_DIAG_MASK_INF_BLOCK_SIZE - 1) / CUDA_DIAG_MASK_INF_BLOCK_SIZE;
-    const sycl::range<3> block_nums(1, block_num_x, nrows_x);
-    stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
-                         [=](sycl::nd_item<3> item_ct1) {
-                             diag_mask_inf_f32(x, dst, ncols_x,
-                                               rows_per_channel, n_past,
-                                               item_ct1);
-                         });
-}
-
-static void soft_max_f32_cuda(const float *x, const float *y, float *dst,
-                              const int ncols_x, const int nrows_x,
-                              const int nrows_y, const float scale,
-                              dpct::queue_ptr stream) {
-    int nth = WARP_SIZE;
-    while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
-    const sycl::range<3> block_dims(1, 1, nth);
-    const sycl::range<3> block_nums(1, 1, nrows_x);
-    /*
-    DPCT1049:61: The work-group size passed to the SYCL kernel may exceed the
-    limit. To get the device limit, query info::device::max_work_group_size.
-    Adjust the work-group size if needed.
-    */
-    stream->submit([&](sycl::handler &cgh) {
-        /*
-        DPCT1101:125: 'CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE' expression was
-        replaced with a value. Modify the code to use the original expression,
-        provided in comments, if it is correct.
-        */
-        sycl::local_accessor<float, 1> buf_acc_ct1(
-            sycl::range<1>(32 /*CUDA_SOFT_MAX_BLOCK_SIZE/WARP_SIZE*/), cgh);
-
-        cgh.parallel_for(
-            sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
-                soft_max_f32(x, y, dst, ncols_x, nrows_y, scale, item_ct1,
-                             buf_acc_ct1.get_pointer());
-            });
-    });
-}
-
-static void im2col_f32_f16_cuda(const float *x, sycl::half *dst, int IW, int IH,
-                                int OW, int OH, int KW, int KH, int IC,
-                                int offset_delta, int s0, int s1, int p0,
-                                int p1, int d0, int d1,
-                                dpct::queue_ptr stream) {
-    const int parallel_elements = OW * KW * KH;
-    const int num_blocks = (parallel_elements + CUDA_IM2COL_BLOCK_SIZE - 1) / CUDA_IM2COL_BLOCK_SIZE;
-    sycl::range<3> block_nums(IC, OH, num_blocks);
-    {
-        dpct::has_capability_or_fail(stream->get_device(),
-                                     {sycl::aspect::fp16});
-        stream->parallel_for(
-            sycl::nd_range<3>(block_nums *
-                                  sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE),
-                              sycl::range<3>(1, 1, CUDA_IM2COL_BLOCK_SIZE)),
-            [=](sycl::nd_item<3> item_ct1) {
-                im2col_f32_f16(x, dst, offset_delta, IW, IH, OW, KW, KH,
-                               parallel_elements, (IC * KH * KW), s0, s1, p0,
-                               p1, d0, d1, item_ct1);
-            });
-    }
-}
-
-// buffer pool for cuda
-#define MAX_CUDA_BUFFERS 256
-
-struct scoped_spin_lock {
-    std::atomic_flag& lock;
-    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
-        while (lock.test_and_set(std::memory_order_acquire)) {
-            ; // spin
-        }
-    }
-    ~scoped_spin_lock() {
-        lock.clear(std::memory_order_release);
-    }
-    scoped_spin_lock(const scoped_spin_lock&) = delete;
-    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
-};
-
-struct cuda_buffer {
-    void * ptr = nullptr;
-    size_t size = 0;
-};
-
-static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
-static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
-
-static void *ggml_cuda_pool_malloc(size_t size, size_t *actual_size) try {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-#ifdef DEBUG_CUDA_MALLOC
-    int nnz = 0;
-    size_t max_size = 0, tot_size = 0;
-#endif
-    size_t best_diff = 1ull << 36;
-    int ibest = -1;
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][i];
-        if (b.ptr != nullptr) {
-#ifdef DEBUG_CUDA_MALLOC
-            ++nnz;
-            tot_size += b.size;
-            if (b.size > max_size) max_size = b.size;
-#endif
-            if (b.size >= size) {
-                size_t diff = b.size - size;
-                if (diff < best_diff) {
-                    best_diff = diff;
-                    ibest = i;
-                    if (!best_diff) {
-                        void * ptr = b.ptr;
-                        *actual_size = b.size;
-                        b.ptr = nullptr;
-                        b.size = 0;
-                        return ptr;
-                    }
-                }
-            }
-        }
-    }
-    if (ibest >= 0) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][ibest];
-        void * ptr = b.ptr;
-        *actual_size = b.size;
-        b.ptr = nullptr;
-        b.size = 0;
-        return ptr;
-    }
-#ifdef DEBUG_CUDA_MALLOC
-    fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz,
-            (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024));
-#endif
-    void * ptr;
-    size_t look_ahead_size = (size_t) (1.05 * size);
-    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(ptr = (void *)sycl::malloc_device(
-                             look_ahead_size, dpct::get_in_order_queue())));
-    *actual_size = look_ahead_size;
-    return ptr;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_pool_free(void *ptr, size_t size) try {
-    scoped_spin_lock lock(g_cuda_pool_lock);
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-
-    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
-        cuda_buffer& b = g_cuda_buffer_pool[id][i];
-        if (b.ptr == nullptr) {
-            b.ptr = ptr;
-            b.size = size;
-            return;
-        }
-    }
-    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
-    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static bool g_cublas_loaded = false;
-
-bool ggml_cublas_loaded(void) {
-    return g_cublas_loaded;
-}
-
-void print_devices(int device_count){
-    for (int id = 0; id < device_count; ++id) {
-        dpct::device_info prop;
-        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(id))));
-
-        fprintf(stderr, "  Device %d: %s, compute capability %d.%d\n", id,
-                prop.get_name(), prop.get_major_version(),
-                prop.get_minor_version());
-    }
-
-}
-
-void ggml_init_cublas() try {
-    static bool initialized = false;
-
-    if (!initialized) {
-
-#ifdef __HIP_PLATFORM_AMD__
-        // Workaround for a rocBLAS bug when using multiple graphics cards:
-        // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346
-        rocblas_initialize();
-        CUDA_CHECK(cudaDeviceSynchronize());
-#endif
-
-        g_device_count = dpct::dev_mgr::instance().device_count();
-        if (DPCT_CHECK_ERROR(g_device_count != 0)) {
-            initialized = true;
-            g_cublas_loaded = false;
-            return;
-        }
-
-        GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
-        int64_t total_vram = 0;
-#if defined(GGML_CUDA_FORCE_MMQ)
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   yes\n", __func__);
-#else
-        fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ:   no\n", __func__);
-#endif
-#if defined(CUDA_USE_TENSOR_CORES)
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
-#else
-        fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
-#endif
-        fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
-        print_devices(g_device_count);
-
-        char * user_device_string = getenv("GGML_SYCL_DEVICE");
-        int user_device_number = -1;
-
-        unsigned n;
-        if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1 && n < g_device_count) {
-            user_device_number = (int)n;
-        } else {
-            user_device_number=0;
-        }
-
-        //zjy hardcode, force set to 1 device
-        int id = 0;
-        g_tensor_split[id] = total_vram;
-        dpct::device_info prop;
-        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(user_device_number))));
-
-        total_vram += prop.get_global_mem_size();
-        g_compute_capabilities[id] =
-                100 * prop.get_major_version() + 10 * prop.get_minor_version();
-
-        g_device_count = 1;
-        for (int id = 0; id < g_device_count; ++id) {
-            g_tensor_split[id] /= total_vram;
-        }
-
-        for (int id = 0; id < g_device_count; ++id) {
-            CUDA_CHECK(ggml_cuda_set_device(user_device_number));
-
-            // create cuda streams
-            for (int is = 0; is < MAX_STREAMS; ++is) {
-                /*
-                DPCT1025:105: The SYCL queue is created ignoring the flag and
-                priority options.
-                */
-                CUDA_CHECK(DPCT_CHECK_ERROR(
-                    g_cudaStreams[id][is] =
-                        dpct::get_current_device().create_queue()));
-            }
-
-            // create cublas handle
-            CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] =
-                                              &dpct::get_in_order_queue()));
-            /*
-            DPCT1027:107: The call to cublasSetMathMode was replaced with 0
-            because this call is redundant in SYCL.
-            */
-            CUBLAS_CHECK(0);
-        }
-
-        // configure logging to stdout
-        // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
-
-
-        ggml_cuda_set_device(user_device_number);
-        fprintf(stderr, "  set Device %d\n", user_device_number);
-
-        initialized = true;
-        g_cublas_loaded = true;
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_set_tensor_split(const float * tensor_split) {
-    if (tensor_split == nullptr) {
-        return;
-    }
-    bool all_zero = true;
-    for (int i = 0; i < g_device_count; ++i) {
-        if (tensor_split[i] != 0.0f) {
-            all_zero = false;
-            break;
-        }
-    }
-    if (all_zero) {
-        return;
-    }
-    float split_sum = 0.0f;
-    for (int i = 0; i < g_device_count; ++i) {
-        g_tensor_split[i] = split_sum;
-        split_sum += tensor_split[i];
-    }
-    for (int i = 0; i < g_device_count; ++i) {
-        g_tensor_split[i] /= split_sum;
-    }
-}
-
-void *ggml_cuda_host_malloc(size_t size) try {
-    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
-        return nullptr;
-    }
-
-    void * ptr = nullptr;
-    dpct::err0 err = DPCT_CHECK_ERROR(
-        ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue()));
-    /*
-    DPCT1000:109: Error handling if-stmt was detected but could not be
-    rewritten.
-    */
-    if (err != 0) {
-        // The allocation error can be bypassed. A null ptr will assigned out of this function.
-        // This can fixed the OOM error in WSL.
-        /*
-        DPCT1026:110: The call to cudaGetLastError was removed because this call
-        is redundant in SYCL.
-        */
-        /*
-        DPCT1001:108: The statement could not be removed.
-        */
-        fprintf(
-            stderr,
-            "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
-            /*
-            DPCT1009:111: SYCL uses exceptions to report errors and does not use
-            the error codes. The original code was commented out and a warning
-            string was inserted. You need to rewrite this code.
-            */
-            size / 1024.0 / 1024.0,
-            "cudaGetErrorString is not supported" /*cudaGetErrorString(err)*/);
-        return nullptr;
-    }
-
-    return ptr;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_host_free(void *ptr) try {
-    CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(ptr, dpct::get_in_order_queue())));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static dpct::err0 ggml_cuda_cpy_tensor_2d(void *dst,
-                                          const struct ggml_tensor *src,
-                                          int64_t i3, int64_t i2,
-                                          int64_t i1_low, int64_t i1_high,
-                                          dpct::queue_ptr stream) try {
-
-    dpct::memcpy_direction kind;
-    char * src_ptr;
-    if (src->backend == GGML_BACKEND_CPU) {
-        kind = dpct::host_to_device;
-        src_ptr = (char *) src->data;
-    } else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
-        GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
-        kind = dpct::device_to_device;
-        ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
-        int id;
-        id = dpct::dev_mgr::instance().current_device_id();
-        // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-        src_ptr = (char *) extra->data_device[id];
-    } else {
-        GGML_ASSERT(false);
-    }
-    char * dst_ptr = (char *) dst;
-
-    const int64_t ne0 = src->ne[0];
-    const int64_t nb0 = src->nb[0];
-    const int64_t nb1 = src->nb[1];
-    const int64_t nb2 = src->nb[2];
-    const int64_t nb3 = src->nb[3];
-    const enum ggml_type type = src->type;
-    const int64_t ts = ggml_type_size(type);
-    const int64_t bs = ggml_blck_size(type);
-    int64_t i1_diff = i1_high - i1_low;
-
-    const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3;
-    if (nb0 == ts && nb1 == ts*ne0/bs) {
-        return DPCT_CHECK_ERROR(stream->memcpy(dst_ptr, x, i1_diff * nb1));
-    } else if (nb0 == ts) {
-        return DPCT_CHECK_ERROR(
-            dpct::async_dpct_memcpy(dst_ptr, ts * ne0 / bs, x, nb1,
-                                    ts * ne0 / bs, i1_diff, kind, *stream));
-    } else {
-        for (int64_t i1 = 0; i1 < i1_diff; i1++) {
-            const void * rx = (const void *) ((const char *) x + i1*nb1);
-            void * rd = (void *) (dst_ptr + i1*ts*ne0/bs);
-            // pretend the row is a matrix with cols=1
-            dpct::err0 r = DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
-                rd, ts / bs, rx, nb0, ts / bs, ne0, kind, *stream));
-            /*
-            DPCT1001:112: The statement could not be removed.
-            */
-            /*
-            DPCT1000:113: Error handling if-stmt was detected but could not be
-            rewritten.
-            */
-            if (r != 0) return r;
-        }
-        return 0;
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_op_get_rows(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_d, const float *src1_d,
-                                  float *dst_d, const dpct::queue_ptr &stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_I32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(src0->nb[0] == ggml_type_size(src0->type));
-    GGML_ASSERT(src1->nb[0] == ggml_type_size(src1->type));
-    GGML_ASSERT(dst->nb[0] == ggml_type_size(dst->type));
-
-    const int32_t * src1_i32 = (const int32_t *) src1_d;
-
-    switch (src0->type) {
-        case GGML_TYPE_F16:
-            get_rows_cuda_float(src0, src1, dst, (const sycl::half *)src0_d,
-                                src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_F32:
-            get_rows_cuda_float(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_0:
-            get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0, src1, dst, src0_d, src1_i32, dst_d, stream);
-            break;
-        default:
-            // TODO: k-quants
-            GGML_ASSERT(false);
-            break;
-    }
-}
-
-template <class op>
-inline void ggml_cuda_op_bin_bcast(const ggml_tensor *src0,
-                                   const ggml_tensor *src1, ggml_tensor *dst,
-                                   const float *src0_dd, const float *src1_dd,
-                                   float *dst_dd,
-                                   const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd,
-             (sycl::half *)dst_dd, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
-        op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
-             main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
-            ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-}
-
-static void ggml_cuda_op_repeat(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_d, const float *src1_d,
-                                float *dst_d,
-                                const dpct::queue_ptr &main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_repeat>>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream);
-
-    (void) src1;
-    (void) src1_d;
-}
-
-inline void ggml_cuda_op_add(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_cuda_op_acc(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->ne[3] == 1); // just 3D tensors supported
-
-    int nb1 = dst->op_params[0] / 4; // 4 bytes of float32
-    int nb2 = dst->op_params[1] / 4; // 4 bytes of float32
-    // int nb3 = dst->op_params[2] / 4; // 4 bytes of float32 - unused
-    int offset = dst->op_params[3] / 4; // offset in bytes
-
-    acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream);
-
-    (void) dst;
-}
-
-inline void ggml_cuda_op_mul(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_cuda_op_div(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_div>>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
-}
-
-inline void ggml_cuda_op_gelu(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_silu(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_gelu_quick(const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_relu(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_leaky_relu(const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float negative_slope;
-    memcpy(&negative_slope, dst->op_params, sizeof(float));
-
-    leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_sqr(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_norm(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_group_norm(const ggml_tensor *src0,
-                                    const ggml_tensor *src1, ggml_tensor *dst,
-                                    const float *src0_dd, const float *src1_dd,
-                                    float *dst_dd,
-                                    const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    int num_groups = dst->op_params[0];
-    int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups);
-    group_norm_f32_cuda(src0_dd, dst_dd, num_groups, group_size, src0->ne[0] * src0->ne[1] * src0->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_concat(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_dd, const float *src1_dd,
-                                float *dst_dd,
-                                const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-
-    for (int i3 = 0; i3 < dst->ne[3]; i3++) {
-        concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream);
-    }
-
-    (void) src1;
-    (void) dst;
-}
-
-inline void ggml_cuda_op_upscale(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const float *src0_dd, const float *src1_dd,
-                                 float *dst_dd,
-                                 const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    const int scale_factor = dst->op_params[0];
-
-    upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], scale_factor, main_stream);
-
-    (void) src1;
-    (void) dst;
-}
-
-inline void ggml_cuda_op_pad(const ggml_tensor *src0, const ggml_tensor *src1,
-                             ggml_tensor *dst, const float *src0_dd,
-                             const float *src1_dd, float *dst_dd,
-                             const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(dst->type == GGML_TYPE_F32);
-    GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors
-
-    pad_f32_cuda(src0_dd, dst_dd,
-        src0->ne[0], src0->ne[1], src0->ne[2],
-        dst->ne[0], dst->ne[1], dst->ne[2], main_stream);
-
-    (void) src1;
-    (void) dst;
-}
-
-inline void ggml_cuda_op_rms_norm(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int ne00 = src0->ne[0];
-    const int nrows = ggml_nrows(src0);
-
-    float eps;
-    memcpy(&eps, dst->op_params, sizeof(float));
-
-    rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_mul_mat_q(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) try {
-
-    const int64_t ne00 = src0->ne[0];
-
-    const int64_t ne10 = src1->ne[0];
-    GGML_ASSERT(ne10 % QK8_1 == 0);
-
-    const int64_t ne0 = dst->ne[0];
-
-    const int64_t row_diff = row_high - row_low;
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into
-    const int64_t nrows_dst = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            ggml_mul_mat_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            ggml_mul_mat_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            ggml_mul_mat_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            ggml_mul_mat_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            ggml_mul_mat_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            ggml_mul_mat_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            ggml_mul_mat_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            ggml_mul_mat_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            ggml_mul_mat_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            ggml_mul_mat_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_ncols, src1_padded_row_size, nrows_dst, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static int64_t get_row_rounding(ggml_type type) {
-    int64_t min_compute_capability = INT_MAX;
-    int64_t max_compute_capability = INT_MIN;
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            if (min_compute_capability > g_compute_capabilities[id]) {
-                min_compute_capability = g_compute_capabilities[id];
-            }
-            if (max_compute_capability < g_compute_capabilities[id]) {
-                max_compute_capability = g_compute_capabilities[id];
-            }
-        }
-    }
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 32;
-        case GGML_TYPE_Q3_K:
-            return min_compute_capability < CC_RDNA2 ? 128 : 64;
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-        case GGML_TYPE_Q6_K:
-            return max_compute_capability >= CC_RDNA2 ? 128 : 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#else
-    switch(type) {
-        case GGML_TYPE_Q4_0:
-        case GGML_TYPE_Q4_1:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q5_0:
-        case GGML_TYPE_Q5_1:
-        case GGML_TYPE_Q8_0:
-            return 64;
-        case GGML_TYPE_F16:
-        case GGML_TYPE_F32:
-            return 1;
-        case GGML_TYPE_Q2_K:
-        case GGML_TYPE_Q3_K:
-        case GGML_TYPE_Q4_K:
-        case GGML_TYPE_Q5_K:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
-        case GGML_TYPE_Q6_K:
-            return 64;
-        default:
-            GGML_ASSERT(false);
-    }
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-}
-
-inline void ggml_cuda_op_mul_mat_vec_q(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) {
-
-    GGML_ASSERT(ggml_nrows(src1) == 1);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddf_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
-}
-
-inline void ggml_cuda_op_dequantize_mul_mat_vec(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) {
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t row_diff = row_high - row_low;
-
-    // on some GPUs it is faster to convert src1 to half and to use half precision intrinsics
-#ifdef GGML_CUDA_F16
-    size_t ash;
-    dfloat * src1_dfloat = nullptr; // dfloat == half
-
-    bool src1_convert_f16 =
-        src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1 ||
-        src0->type == GGML_TYPE_Q5_0 || src0->type == GGML_TYPE_Q5_1 ||
-        src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
-
-    if (src1_convert_f16) {
-        src1_dfloat = (half *) ggml_cuda_pool_malloc(ne00*sizeof(half), &ash);
-        ggml_cpy_f32_f16_cuda((const char *) src1_ddf_i, (char *) src1_dfloat, ne00,
-                                ne00, 1, sizeof(float), 0, 0,
-                                ne00, 1, sizeof(half),  0, 0, stream);
-    }
-#else
-    const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion
-#endif // GGML_CUDA_F16
-
-    switch (src0->type) {
-        case GGML_TYPE_Q4_0:
-            dequantize_mul_mat_vec_q4_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_1:
-            dequantize_mul_mat_vec_q4_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_0:
-            dequantize_mul_mat_vec_q5_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_1:
-            dequantize_mul_mat_vec_q5_1_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q8_0:
-            dequantize_mul_mat_vec_q8_0_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q2_K:
-            dequantize_mul_mat_vec_q2_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q3_K:
-            dequantize_mul_mat_vec_q3_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q4_K:
-            dequantize_mul_mat_vec_q4_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q5_K:
-            dequantize_mul_mat_vec_q5_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_Q6_K:
-            dequantize_mul_mat_vec_q6_K_cuda(src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stream);
-            break;
-        case GGML_TYPE_F16:
-            convert_mul_mat_vec_f16_cuda(src0_dd_i, src1_dfloat, dst_dd_i, ne00, row_diff, stream);
-            break;
-        default:
-            GGML_ASSERT(false);
-            break;
-    }
-
-#ifdef GGML_CUDA_F16
-    if (src1_convert_f16) {
-        ggml_cuda_pool_free(src1_dfloat, ash);
-    }
-#endif // GGML_CUDA_F16
-
-    (void) src1;
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_ncols;
-    (void) src1_padded_row_size;
-}
-
-inline void ggml_cuda_op_mul_mat_cublas(
-    const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst,
-    const char *src0_dd_i, const float *src1_ddf_i, const char *src1_ddq_i,
-    float *dst_dd_i, const int64_t row_low, const int64_t row_high,
-    const int64_t src1_ncols, const int64_t src1_padded_row_size,
-    const dpct::queue_ptr &stream) try {
-
-    GGML_ASSERT(src0_dd_i  != nullptr);
-    GGML_ASSERT(src1_ddf_i != nullptr);
-    GGML_ASSERT(dst_dd_i   != nullptr);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-
-    const int64_t row_diff = row_high - row_low;
-
-    int id;
-    id = dpct::dev_mgr::instance().current_device_id();
-    // CUDA_CHECK(id = dpct::dev_mgr::instance().current_device_id());
-
-    // the main device has a larger memory buffer to hold the results from all GPUs
-    // ldc == nrows of the matrix that cuBLAS writes into
-    int ldc = dst->backend == GGML_BACKEND_GPU && id == g_main_device ? ne0 : row_diff;
-
-    const int compute_capability = g_compute_capabilities[id];
-
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1]) {
-        // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
-        sycl::half *src0_as_f16 = nullptr;
-        size_t src0_as = 0;
-        if (src0->type != GGML_TYPE_F16) {
-            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type);
-            GGML_ASSERT(to_fp16_cuda != nullptr);
-            size_t ne = row_diff*ne00;
-            src0_as_f16 = (sycl::half *)ggml_cuda_pool_malloc(
-                ne * sizeof(sycl::half), &src0_as);
-            to_fp16_cuda(src0_dd_i, src0_as_f16, ne, stream);
-        }
-        const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16
-                                         ? (const sycl::half *)src0_dd_i
-                                         : src0_as_f16;
-
-        sycl::half *src1_as_f16 = nullptr;
-        size_t src1_as = 0;
-        if (src1->type != GGML_TYPE_F16) {
-            const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-            GGML_ASSERT(to_fp16_cuda != nullptr);
-            size_t ne = src1_ncols*ne10;
-            src1_as_f16 = (sycl::half *)ggml_cuda_pool_malloc(
-                ne * sizeof(sycl::half), &src1_as);
-            to_fp16_cuda(src1_ddf_i, src1_as_f16, ne, stream);
-        }
-        const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
-                                         ? (const sycl::half *)src1_ddf_i
-                                         : src1_as_f16;
-        size_t dst_as = 0;
-        sycl::half *dst_f16 = (sycl::half *)ggml_cuda_pool_malloc(
-            row_diff * src1_ncols * sizeof(sycl::half), &dst_as);
-
-        const sycl::half alpha_f16 = 1.0f;
-        const sycl::half beta_f16 = 0.0f;
-
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm(
-            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10,
-            &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00,
-            src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, dst_f16,
-            dpct::library_data_t::real_half, ldc,
-            dpct::library_data_t::real_half)));
-
-        const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-        to_fp32_cuda(dst_f16, dst_dd_i, row_diff*src1_ncols, stream);
-
-        ggml_cuda_pool_free(dst_f16, dst_as);
-
-        if (src0_as != 0) {
-            ggml_cuda_pool_free(src0_as_f16, src0_as);
-        }
-
-        if (src1_as != 0) {
-            ggml_cuda_pool_free(src1_as_f16, src1_as);
-        }
-    }
-    else {
-        float * src0_ddq_as_f32 = nullptr;
-        size_t src0_as = 0;
-
-        if (src0->type != GGML_TYPE_F32) {
-            const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type);
-            GGML_ASSERT(to_fp32_cuda != nullptr);
-            src0_ddq_as_f32 = (float *) ggml_cuda_pool_malloc(row_diff*ne00 * sizeof(float), &src0_as); // NOLINT
-            to_fp32_cuda(src0_dd_i, src0_ddq_as_f32, row_diff*ne00, stream);
-        }
-        const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32;
-
-        const float alpha = 1.0f;
-        const float beta = 0.0f;
-
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(g_cublas_handles[id] = stream));
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(oneapi::mkl::blas::column_major::gemm(
-            *g_cublas_handles[id], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, alpha,
-            src0_ddf_i, ne00, src1_ddf_i, ne10, beta, dst_dd_i, ldc)));
-
-        if (src0_as != 0) {
-            ggml_cuda_pool_free(src0_ddq_as_f32, src0_as);
-        }
-    }
-
-    (void) dst;
-    (void) src1_ddq_i;
-    (void) src1_padded_row_size;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-inline void ggml_cuda_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
-                              ggml_tensor *dst, const float *src0_dd,
-                              const float *src1_dd, float *dst_dd,
-                              const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
-    GGML_ASSERT(src0->type == dst->type);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne2 = dst->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past      = ((int32_t *) dst->op_params)[0];
-    const int n_dims      = ((int32_t *) dst->op_params)[1];
-    const int mode        = ((int32_t *) dst->op_params)[2];
-    const int n_ctx       = ((int32_t *) dst->op_params)[3];
-    const int n_orig_ctx  = ((int32_t *) dst->op_params)[4];
-
-    // RoPE alteration for extended context
-    float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
-    memcpy(&freq_base,   (int32_t *) dst->op_params +  5, sizeof(float));
-    memcpy(&freq_scale,  (int32_t *) dst->op_params +  6, sizeof(float));
-    memcpy(&ext_factor,  (int32_t *) dst->op_params +  7, sizeof(float));
-    memcpy(&attn_factor, (int32_t *) dst->op_params +  8, sizeof(float));
-    memcpy(&beta_fast,   (int32_t *) dst->op_params +  9, sizeof(float));
-    memcpy(&beta_slow,   (int32_t *) dst->op_params + 10, sizeof(float));
-
-    const int32_t * pos = nullptr;
-    if ((mode & 1) == 0) {
-        GGML_ASSERT(src1->type == GGML_TYPE_I32);
-        GGML_ASSERT(src1->ne[0] == ne2);
-        pos = (const int32_t *) src1_dd;
-    }
-
-    const bool is_neox = mode & 2;
-    const bool is_glm  = mode & 4;
-
-    rope_corr_dims corr_dims;
-    ggml_rope_yarn_corr_dims(n_dims, n_orig_ctx, freq_base, beta_fast, beta_slow, corr_dims.v);
-
-    // compute
-    if (is_glm) {
-        GGML_ASSERT(false);
-        rope_glm_f32_cuda(src0_dd, dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, n_ctx, main_stream);
-    } else if (is_neox) {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_neox_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, n_dims, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_neox_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd,
-                           ne00, n_dims, nrows, pos, freq_scale, ne01,
-                           freq_base, ext_factor, attn_factor, corr_dims,
-                           main_stream);
-        } else {
-            GGML_ASSERT(false);
-        }
-    } else {
-        if (src0->type == GGML_TYPE_F32) {
-            rope_cuda(
-                (const float *)src0_dd, (float *)dst_dd, ne00, nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                attn_factor, corr_dims, main_stream
-            );
-        } else if (src0->type == GGML_TYPE_F16) {
-            rope_cuda((const sycl::half *)src0_dd, (sycl::half *)dst_dd, ne00,
-                      nrows, pos, freq_scale, ne01, freq_base, ext_factor,
-                      attn_factor, corr_dims, main_stream);
-        } else {
-            GGML_ASSERT(false);
-        }
-    }
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    //GGML_ASSERT(ne01 + n_past == ne00);
-    GGML_ASSERT(n_head == ne02);
-
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
-
-    (void) src1;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_im2col(const ggml_tensor *src0,
-                                const ggml_tensor *src1, ggml_tensor *dst,
-                                const float *src0_dd, const float *src1_dd,
-                                float *dst_dd,
-                                const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F16);
-
-    const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
-    const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
-    const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
-    const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
-    const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
-    const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
-
-    const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
-
-    const int64_t IC = src1->ne[is_2D ? 2 : 1];
-    const int64_t IH = is_2D ? src1->ne[1] : 1;
-    const int64_t IW =         src1->ne[0];
-
-    const int64_t KH = is_2D ? src0->ne[1] : 1;
-    const int64_t KW =         src0->ne[0];
-
-    const int64_t OH = is_2D ? dst->ne[2] : 1;
-    const int64_t OW =         dst->ne[1];
-
-    const size_t delta_offset = src1->nb[is_2D ? 2 : 1] / 4; // nb is byte offset, src is type float32
-
-    im2col_f32_f16_cuda(src1_dd, (sycl::half *)dst_dd, IW, IH, OW, OH, KW, KH,
-                        IC, delta_offset, s0, s1, p0, p1, d0, d1, main_stream);
-
-    (void) src0;
-    (void) src0_dd;
-}
-
-inline void ggml_cuda_op_sum_rows(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_argsort(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const float *src0_dd, const float *src1_dd,
-                                 float *dst_dd,
-                                 const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_I32);
-
-    const int64_t ncols = src0->ne[0];
-    const int64_t nrows = ggml_nrows(src0);
-
-    enum ggml_sort_order order = (enum ggml_sort_order) dst->op_params[0];
-
-    argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_diag_mask_inf(const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst, const float *src0_dd,
-                                       const float *src1_dd, float *dst_dd,
-                                       const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int nrows0 = ggml_nrows(src0);
-
-    const int n_past = ((int32_t *) dst->op_params)[0];
-
-    diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-inline void ggml_cuda_op_soft_max(const ggml_tensor *src0,
-                                  const ggml_tensor *src1, ggml_tensor *dst,
-                                  const float *src0_dd, const float *src1_dd,
-                                  float *dst_dd,
-                                  const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t nrows_x = ggml_nrows(src0);
-    const int64_t nrows_y = src1 ? ggml_nrows(src1) : 1;
-
-    float scale = 1.0f;
-    memcpy(&scale, dst->op_params, sizeof(float));
-
-    soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream);
-
-    (void) dst;
-}
-
-inline void ggml_cuda_op_scale(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) try {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float scale;
-    // HACK: support for ggml backend interface
-    if (src1->backend == GGML_BACKEND_CPU) {
-        scale = ((float *) src1->data)[0];
-    } else {
-        // TODO: pass pointer to kernel instead of copying to host
-        CUDA_CHECK(
-            DPCT_CHECK_ERROR(dpct::get_in_order_queue()
-                                 .memcpy(&scale, src1->data, sizeof(float))
-                                 .wait()));
-    }
-
-    scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
-    /*
-    DPCT1010:114: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    CUDA_CHECK(0);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-inline void ggml_cuda_op_clamp(const ggml_tensor *src0, const ggml_tensor *src1,
-                               ggml_tensor *dst, const float *src0_dd,
-                               const float *src1_dd, float *dst_dd,
-                               const dpct::queue_ptr &main_stream) {
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    float min;
-    float max;
-    memcpy(&min, dst->op_params, sizeof(float));
-    memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
-
-    clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream);
-    /*
-    DPCT1010:115: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    CUDA_CHECK(0);
-
-    (void) src1;
-    (void) dst;
-    (void) src1_dd;
-}
-
-static void ggml_cuda_op_flatten(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 const ggml_cuda_op_flatten_t op) try {
-    const int64_t nrows0 = ggml_nrows(src0);
-
-    const bool use_src1 = src1 != nullptr;
-    const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1;
-
-    GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(              dst->backend != GGML_BACKEND_GPU_SPLIT);
-
-    ggml_tensor_extra_gpu * src0_extra =            (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
-    ggml_tensor_extra_gpu * dst_extra  =            (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool src0_on_device =             src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
-    const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
-    const bool  dst_on_device =              dst->backend == GGML_BACKEND_GPU;
-
-    const bool src1_stays_on_host = use_src1 && dst->op == GGML_OP_SCALE;
-
-    // dd = data device
-    float * src0_ddf = nullptr;
-    float * src1_ddf = nullptr;
-    float *  dst_ddf = nullptr;
-
-    // as = actual size
-    size_t src0_asf = 0;
-    size_t src1_asf = 0;
-    size_t  dst_asf = 0;
-
-    ggml_cuda_set_device(g_main_device);
-    const dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
-
-    if (src0_on_device) {
-        src0_ddf = (float *) src0_extra->data_device[g_main_device];
-    } else {
-        src0_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_asf);
-        CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream));
-    }
-
-    if (use_src1 && !src1_stays_on_host) {
-        if (src1_on_device) {
-            src1_ddf = (float *) src1_extra->data_device[g_main_device];
-        } else {
-            src1_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf);
-            CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream));
-        }
-    }
-    if (dst_on_device) {
-        dst_ddf = (float *) dst_extra->data_device[g_main_device];
-    } else {
-        dst_ddf = (float *) ggml_cuda_pool_malloc(ggml_nbytes(dst), &dst_asf);
-    }
-
-    // do the computation
-    op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream);
-    /*
-    DPCT1010:116: SYCL uses exceptions to report errors and does not use the
-    error codes. The call was replaced with 0. You need to rewrite this code.
-    */
-    CUDA_CHECK(0);
-
-    // copy dst to host if necessary
-    if (!dst_on_device) {
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            main_stream->memcpy(dst->data, dst_ddf, ggml_nbytes(dst))));
-    }
-
-    if (src0_asf > 0) {
-        ggml_cuda_pool_free(src0_ddf, src0_asf);
-    }
-    if (src1_asf > 0) {
-        ggml_cuda_pool_free(src1_ddf, src1_asf);
-    }
-    if (dst_asf > 0) {
-        ggml_cuda_pool_free(dst_ddf, dst_asf);
-    }
-
-    if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            dpct::get_current_device().queues_wait_and_throw()));
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_set_peer_access(const int n_tokens) {
-    static bool peer_access_enabled = false;
-
-    const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE;
-
-    if (peer_access_enabled == enable_peer_access) {
-        return;
-    }
-
-#ifdef NDEBUG
-    for (int id = 0; id < g_device_count; ++id) {
-        CUDA_CHECK(ggml_cuda_set_device(id));
-
-        for (int id_other = 0; id_other < g_device_count; ++id_other) {
-            if (id == id_other) {
-                continue;
-            }
-            if (id != g_main_device && id_other != g_main_device) {
-                continue;
-            }
-
-            int can_access_peer=1;
-            // CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access_peer, id, id_other));
-            if (can_access_peer) {
-                if (enable_peer_access) {
-                    // CUDA_CHECK(cudaDeviceEnablePeerAccess(id_other, 0));
-                } else {
-                    // CUDA_CHECK(cudaDeviceDisablePeerAccess(id_other));
-                }
-            }
-        }
-    }
-#endif // NDEBUG
-
-    peer_access_enabled = enable_peer_access;
-}
-
-static void ggml_cuda_op_mul_mat(const ggml_tensor *src0,
-                                 const ggml_tensor *src1, ggml_tensor *dst,
-                                 ggml_cuda_op_mul_mat_t op,
-                                 const bool convert_src1_to_q8_1) try {
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-    const int64_t nrows0 = ggml_nrows(src0);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-    const int64_t nrows1 = ggml_nrows(src1);
-
-    GGML_ASSERT(ne03 == ne13);
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    const int nb2 = dst->nb[2];
-    const int nb3 = dst->nb[3];
-
-    ggml_cuda_set_peer_access(ne11);
-
-    GGML_ASSERT(dst->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src1->backend != GGML_BACKEND_GPU_SPLIT);
-
-    GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0);
-
-    const int64_t i02_divisor = ne12 / ne02;
-
-    const size_t src0_ts = ggml_type_size(src0->type);
-    const size_t src0_bs = ggml_blck_size(src0->type);
-    const size_t q8_1_ts = sizeof(block_q8_1);
-    const size_t q8_1_bs = QK8_1;
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    ggml_tensor_extra_gpu *  dst_extra = (ggml_tensor_extra_gpu *)  dst->extra;
-
-    const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
-    const bool src0_is_contiguous = ggml_is_contiguous(src0);
-    const bool src1_is_contiguous = ggml_is_contiguous(src1);
-
-    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
-
-    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
-    GGML_ASSERT(!(split && ne02 > 1));
-    GGML_ASSERT(!(split && ne03 > 1));
-    GGML_ASSERT(!(split && ne02 < ne12));
-
-    // dd = data device
-    char  *  src0_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
-    float * src1_ddf[GGML_CUDA_MAX_DEVICES] = {nullptr}; // float
-    char  * src1_ddq[GGML_CUDA_MAX_DEVICES] = {nullptr}; // q8_1
-    float *   dst_dd[GGML_CUDA_MAX_DEVICES] = {nullptr};
-
-    // as = actual size
-    size_t  src0_as[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t src1_asf[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t src1_asq[GGML_CUDA_MAX_DEVICES] = {0};
-    size_t   dst_as[GGML_CUDA_MAX_DEVICES] = {0};
-
-    int64_t  row_low[GGML_CUDA_MAX_DEVICES];
-    int64_t row_high[GGML_CUDA_MAX_DEVICES];
-
-    int used_devices = 0;
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        // by default, use all rows
-        row_low[id]  = 0;
-        row_high[id] = ne01;
-
-        // for multi GPU, get the row boundaries from tensor split
-        // and round to mul_mat_q tile sizes
-        if (split) {
-            const int64_t rounding = get_row_rounding(src0->type);
-
-            if (id != 0) {
-                row_low[id]  = ne01*g_tensor_split[id];
-                row_low[id] -= row_low[id] % rounding;
-            }
-
-            if (id != g_device_count - 1) {
-                row_high[id]  = ne01*g_tensor_split[id + 1];
-                row_high[id] -= row_high[id] % rounding;
-            }
-        }
-    }
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
-            continue;
-        }
-
-        used_devices++;
-
-        const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
-        const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
-
-        ggml_cuda_set_device(id);
-        const dpct::queue_ptr stream = g_cudaStreams[id][0];
-
-        if (src0_on_device && src0_is_contiguous) {
-            src0_dd[id] = (char *) src0_extra->data_device[id];
-        } else {
-            // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0);
-            src0_dd[id] = (char *) ggml_cuda_pool_malloc(ggml_nbytes(src0), &src0_as[id]);
-        }
-
-        if (src1_on_device && src1_is_contiguous) {
-            src1_ddf[id] = (float *) src1_extra->data_device[id];
-        } else {
-            src1_ddf[id] = (float *) ggml_cuda_pool_malloc(ggml_nbytes(src1), &src1_asf[id]);
-        }
-
-        if (convert_src1_to_q8_1) {
-            src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
-
-            if (src1_on_device && src1_is_contiguous) {
-                quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
-                /*
-                DPCT1010:117: SYCL uses exceptions to report errors and does not
-                use the error codes. The call was replaced with 0. You need to
-                rewrite this code.
-                */
-                CUDA_CHECK(0);
-            }
-        }
-
-        if (dst_on_device) {
-            dst_dd[id] = (float *) dst_extra->data_device[id];
-        } else {
-            const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst);
-            dst_dd[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_as[id]);
-        }
-    }
-
-    // if multiple devices are used they need to wait for the main device
-    // here an event is recorded that signals that the main device has finished calculating the input data
-    if (split && used_devices > 1) {
-        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-        /*
-        DPCT1012:118: Detected kernel execution time measurement pattern and
-        generated an initial code for time measurements in SYCL. You can change
-        the way time is measured depending on your goals.
-        */
-        /*
-        DPCT1024:119: The original code returned the error code that was further
-        consumed by the program logic. This original code was replaced with 0.
-        You may need to rewrite the program logic consuming the error code.
-        */
-        src0_extra->events_ct1_0[g_main_device] =
-            std::chrono::steady_clock::now();
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            *src0_extra->events[g_main_device][0] =
-                g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier()));
-    }
-
-    const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
-    for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
-        const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
-        const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
-
-        for (int64_t id = 0; id < g_device_count; ++id) {
-            if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
-                continue;
-            }
-
-            const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
-            const bool  dst_on_device =  dst->backend == GGML_BACKEND_GPU && id == g_main_device;
-            const int64_t row_diff = row_high[id] - row_low[id];
-
-            ggml_cuda_set_device(id);
-            const dpct::queue_ptr stream = g_cudaStreams[id][is];
-
-            // wait for main GPU data if necessary
-            if (split && (id != g_main_device || is != 0)) {
-                // CUDA_CHECK(DPCT_CHECK_ERROR(stream->ext_oneapi_submit_barrier(
-                //     {*src0_extra->events[g_main_device][0]})));
-
-                CUDA_CHECK(DPCT_CHECK_ERROR(
-                    *src0_extra->events[g_main_device][0] = stream->ext_oneapi_submit_barrier()));
-
-            }
-
-            for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) {
-                const int64_t i03 = i0 / ne12;
-                const int64_t i02 = i0 % ne12;
-
-                const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs;
-
-                // for split tensors the data begins at i0 == i0_offset_low
-                char  *  src0_dd_i =  src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs;
-                float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10;
-                char  * src1_ddq_i = src1_ddq[id] +  src1_ddq_i_offset;
-                float *   dst_dd_i =   dst_dd[id] + (i0*ne1  + src1_col_0) * (dst_on_device ? ne0 : row_diff);
-
-                // the main device memory buffer can be on VRAM scratch, with space for all partial results
-                // in that case an offset on dst_ddf_i is needed
-                if (dst->backend == GGML_BACKEND_GPU && id == g_main_device) {
-                    dst_dd_i += row_low[id]; // offset is 0 if no tensor split
-                }
-
-                // copy src0, src1 to device if necessary
-                if (src1->backend == GGML_BACKEND_GPU && src1_is_contiguous) {
-                    if (id != g_main_device) {
-                        if (convert_src1_to_q8_1) {
-                            char * src1_ddq_i_source = src1_ddq[g_main_device] + src1_ddq_i_offset;
-                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
-                                src1_ddq_i, src1_ddq_i_source,
-                                src1_ncols * src1_padded_col_size * q8_1_ts /
-                                    q8_1_bs)));
-                        } else {
-                            float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device];
-                            src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10;
-                            CUDA_CHECK(DPCT_CHECK_ERROR(stream->memcpy(
-                                src1_ddf_i, src1_ddf_i_source,
-                                src1_ncols * ne10 * sizeof(float))));
-                        }
-                    }
-                } else if (src1->backend == GGML_BACKEND_CPU || (src1_on_device && !src1_is_contiguous)) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(
-                                   src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream));
-                } else {
-                    GGML_ASSERT(false);
-                }
-
-                if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
-                    quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
-                    /*
-                    DPCT1010:120: SYCL uses exceptions to report errors and does
-                    not use the error codes. The call was replaced with 0. You
-                    need to rewrite this code.
-                    */
-                    CUDA_CHECK(0);
-                }
-
-                if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
-                    CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
-                }
-
-                // do the computation
-                op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
-                   row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);
-                /*
-                DPCT1010:121: SYCL uses exceptions to report errors and does not
-                use the error codes. The call was replaced with 0. You need to
-                rewrite this code.
-                */
-                CUDA_CHECK(0);
-
-                // copy dst to host or other device if necessary
-                if (!dst_on_device) {
-                    void * dst_off_device;
-                    dpct::memcpy_direction kind;
-                    if (dst->backend == GGML_BACKEND_CPU) {
-                        dst_off_device = dst->data;
-                        kind = dpct::device_to_host;
-                    } else if (dst->backend == GGML_BACKEND_GPU) {
-                        dst_off_device = dst_extra->data_device[g_main_device];
-                        kind = dpct::device_to_device;
-                    } else {
-                        GGML_ASSERT(false);
-                    }
-                    if (split) {
-                        // src0 = weight matrix is saved as a transposed matrix for better memory layout.
-                        // dst is NOT transposed.
-                        // The outputs of matrix matrix multiplications can therefore NOT simply be concatenated for >1 GPU.
-                        // Instead they need to be copied to the correct slice in ne0 = dst row index.
-                        // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results.
-                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
-                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                        dhf_dst_i += src1_col_0*ne0 + row_low[id];
-                        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::async_dpct_memcpy(
-                            dhf_dst_i, ne0 * sizeof(float), dst_dd_i,
-                            row_diff * sizeof(float), row_diff * sizeof(float),
-                            src1_ncols, kind, *stream)));
-                    } else {
-                        float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3);
-                        GGML_ASSERT(dst->nb[1] == ne0*sizeof(float));
-                        dhf_dst_i += src1_col_0*ne0;
-                        CUDA_CHECK(DPCT_CHECK_ERROR(
-                            stream->memcpy(dhf_dst_i, dst_dd_i,
-                                           src1_ncols * ne0 * sizeof(float))));
-                    }
-                }
-
-                // add event for the main device to wait on until other device is done
-                if (split && (id != g_main_device || is != 0)) {
-                    /*
-                    DPCT1012:122: Detected kernel execution time measurement
-                    pattern and generated an initial code for time measurements
-                    in SYCL. You can change the way time is measured depending
-                    on your goals.
-                    */
-                    /*
-                    DPCT1024:123: The original code returned the error code that
-                    was further consumed by the program logic. This original
-                    code was replaced with 0. You may need to rewrite the
-                    program logic consuming the error code.
-                    */
-                    src0_extra->events_ct1_is[id] =
-                        std::chrono::steady_clock::now();
-                    CUDA_CHECK(DPCT_CHECK_ERROR(
-                        *src0_extra->events[id][is] =
-                            stream->ext_oneapi_submit_barrier()));
-                }
-            }
-        }
-    }
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
-            continue;
-        }
-        CUDA_CHECK(ggml_cuda_set_device(id));
-
-        // free buffers again when done
-        if (src0_as[id] > 0) {
-            ggml_cuda_pool_free(src0_dd[id], src0_as[id]);
-        }
-        if (src1_asf[id] > 0) {
-            ggml_cuda_pool_free(src1_ddf[id], src1_asf[id]);
-        }
-        if (src1_asq[id] > 0) {
-            ggml_cuda_pool_free(src1_ddq[id], src1_asq[id]);
-        }
-        if (dst_as[id] > 0) {
-            ggml_cuda_pool_free(dst_dd[id], dst_as[id]);
-        }
-    }
-
-    // main device waits for all other devices to be finished
-    if (split && g_device_count > 1) {
-        int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE;
-        is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS;
-
-        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-        for (int64_t id = 0; id < g_device_count; ++id) {
-            if (row_low[id] == row_high[id]) {
-                continue;
-            }
-            for (int64_t is = 0; is < is_max; ++is) {
-                CUDA_CHECK(DPCT_CHECK_ERROR(
-                    g_cudaStreams[g_main_device][0]->ext_oneapi_submit_barrier(
-                        {*src0_extra->events[id][is]})));
-            }
-        }
-    }
-
-    if (dst->backend == GGML_BACKEND_CPU) {
-        CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            dpct::get_current_device().queues_wait_and_throw()));
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
-}
-
-static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
-}
-
-static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
-}
-
-static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc);
-}
-
-static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul);
-}
-
-static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div);
-}
-
-static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu);
-}
-
-static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu);
-}
-
-static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick);
-}
-
-static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh);
-}
-
-static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu);
-}
-
-static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_leaky_relu);
-}
-
-static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr);
-}
-
-static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm);
-}
-
-static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm);
-}
-
-static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat);
-}
-
-static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale);
-}
-
-static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad);
-}
-
-static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm);
-}
-
-bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
-    if (!g_cublas_loaded) return false;
-
-    const int64_t ne10 = src1->ne[0];
-
-    const int64_t ne0 = dst->ne[0];
-    const int64_t ne1 = dst->ne[1];
-
-    // TODO: find the optimal values for these
-    return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
-            src1->type == GGML_TYPE_F32 &&
-             dst->type == GGML_TYPE_F32 &&
-            (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
-}
-
-static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor *src0,
-                                       const ggml_tensor *src1,
-                                       ggml_tensor *dst) try {
-    GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
-    GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor *src0,
-                                     const ggml_tensor *src1,
-                                     ggml_tensor *dst) try {
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-    GGML_ASSERT(!ggml_is_permuted(src0));
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-
-    const int64_t ne12 = src1->ne[2];
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    const int64_t row_stride_x = nb01 / sizeof(sycl::half);
-    const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
-
-    ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void k_compute_batched_ptrs(const sycl::half *src0_as_f16,
-                                   const sycl::half *src1_as_f16,
-                                   sycl::half *dst_f16, const void **ptrs_src,
-                                   void **ptrs_dst, int ne12, int ne13,
-                                   int ne23, int nb02, int nb03, int nb12,
-                                   int nb13, int nb2, int nb3, int r2, int r3,
-                                   const sycl::nd_item<3> &item_ct1) {
-    int i13 = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
-              item_ct1.get_local_id(2);
-    int i12 = item_ct1.get_group(1) * item_ct1.get_local_range(1) +
-              item_ct1.get_local_id(1);
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int i03 = i13 / r3;
-    int i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02   + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)     dst_f16 + i12* nb2/2 + i13* nb3/2;
-}
-
-static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor *src0,
-                                                 const ggml_tensor *src1,
-                                                 ggml_tensor *dst) try {
-    GGML_ASSERT(!ggml_is_transposed(src0));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src0->type == GGML_TYPE_F16);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
-
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
-
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne  = ggml_nelements(dst);
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
-
-    CUBLAS_CHECK(
-        DPCT_CHECK_ERROR(g_cublas_handles[g_main_device] = main_stream));
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[g_main_device];
-    sycl::half *src0_as_f16 = (sycl::half *)src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
-
-    size_t src1_as = 0;
-    sycl::half *src1_as_f16 =
-        (sycl::half *)ggml_cuda_pool_malloc(ne1 * sizeof(sycl::half), &src1_as);
-    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
-
-    size_t dst_as = 0;
-    sycl::half *dst_f16 =
-        (sycl::half *)ggml_cuda_pool_malloc(ne * sizeof(sycl::half), &dst_as);
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    const sycl::half alpha_f16 = 1.0f;
-    const sycl::half beta_f16 = 0.0f;
-
-#if 0
-    // use cublasGemmEx
-    {
-        for (int i13 = 0; i13 < ne13; ++i13) {
-            for (int i12 = 0; i12 < ne12; ++i12) {
-                int i03 = i13 / r3;
-                int i02 = i12 / r2;
-
-                CUBLAS_CHECK(
-                        cublasGemmEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-                            ne01, ne11, ne10,
-                            &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2]   + i03*src0->nb[3]  , CUDA_R_16F, nb01/sizeof(half),
-                                        (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float),
-                            &beta_f16,  (      char *)     dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01,
-                            CUBLAS_COMPUTE_16F,
-                            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-            }
-        }
-    }
-#else
-    if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) {
-        // there is no broadcast and src0, src1 are contiguous across dims 2, 3
-        // use cublasGemmStridedBatchedEx
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
-            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, &alpha_f16,
-            (const char *)src0_as_f16, dpct::library_data_t::real_half,
-            nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half),
-            (const char *)src1_as_f16, dpct::library_data_t::real_half,
-            nb11 / sizeof(float), src1->nb[2] / sizeof(float), &beta_f16,
-            (char *)dst_f16, dpct::library_data_t::real_half, ne01,
-            dst->nb[2] / sizeof(float), ne12 * ne13,
-            dpct::library_data_t::real_half)));
-    } else {
-        // use cublasGemmBatchedEx
-        const int ne23 = ne12*ne13;
-
-        const void ** ptrs_src = nullptr;
-              void ** ptrs_dst = nullptr;
-
-        size_t ptrs_src_s = 0;
-        size_t ptrs_dst_s = 0;
-
-        ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-        ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
-
-        sycl::range<3> block_dims(1, ne12, ne13);
-        /*
-        DPCT1049:62: The work-group size passed to the SYCL kernel may exceed
-        the limit. To get the device limit, query
-        info::device::max_work_group_size. Adjust the work-group size if needed.
-        */
-        {
-            dpct::has_capability_or_fail(main_stream->get_device(),
-                                         {sycl::aspect::fp16});
-            main_stream->submit([&](sycl::handler &cgh) {
-                int dst_nb_ct12 = dst->nb[2];
-                int dst_nb_ct13 = dst->nb[3];
-
-                cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims),
-                                 [=](sycl::nd_item<3> item_ct1) {
-                                     k_compute_batched_ptrs(
-                                         src0_as_f16, src1_as_f16, dst_f16,
-                                         ptrs_src, ptrs_dst, ne12, ne13, ne23,
-                                         nb02, nb03, nb12, nb13, dst_nb_ct12,
-                                         dst_nb_ct13, r2, r3, item_ct1);
-                                 });
-            });
-        }
-        /*
-        DPCT1010:124: SYCL uses exceptions to report errors and does not use the
-        error codes. The call was replaced with 0. You need to rewrite this
-        code.
-        */
-        CUDA_CHECK(0);
-
-        CUBLAS_CHECK(DPCT_CHECK_ERROR(dpct::gemm_batch(
-            *g_cublas_handles[g_main_device], oneapi::mkl::transpose::trans,
-            oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, &alpha_f16,
-            (const void **)(ptrs_src + 0 * ne23),
-            dpct::library_data_t::real_half, nb01 / sizeof(sycl::half),
-            (const void **)(ptrs_src + 1 * ne23),
-            dpct::library_data_t::real_half, nb11 / sizeof(float), &beta_f16,
-            (void **)(ptrs_dst + 0 * ne23), dpct::library_data_t::real_half,
-            ne01, ne23, dpct::library_data_t::real_half)));
-
-        if (ptrs_src_s != 0) {
-            ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
-        }
-        if (ptrs_dst_s != 0) {
-            ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
-        }
-    }
-#endif
-
-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    const bool all_on_device =
-        (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
-        (src1->backend == GGML_BACKEND_GPU) &&
-        ( dst->backend == GGML_BACKEND_GPU);
-
-    const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
-
-    int64_t min_compute_capability = INT_MAX;
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
-            min_compute_capability = g_compute_capabilities[id];
-        }
-    }
-
-#ifdef CUDA_USE_TENSOR_CORES
-    const bool use_tensor_cores = true;
-#else
-    const bool use_tensor_cores = false;
-#endif
-
-    // debug helpers
-    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
-    //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]);
-    //printf("      %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
-    //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
-    //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
-
-    if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
-        // KQ single-batch
-        ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
-    } else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
-        // KQV single-batch
-        ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
-    } else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
-        // KQ + KQV multi-batch
-        ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
-    } else if (src0->type == GGML_TYPE_F32) {
-        ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-    } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
-        if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
-#ifdef GGML_CUDA_FORCE_DMMV
-            const bool use_mul_mat_vec_q = false;
-#else
-            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
-#endif // GGML_CUDA_FORCE_DMMV
-
-            if (use_mul_mat_vec_q) {
-                // NOTE: this kernel does not support ggml_nrows(src1) > 1
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true);
-            } else {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false);
-            }
-        } else {
-            bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
-
-            // when tensor cores are available, use them for large batch size
-            // ref: https://github.com/ggerganov/llama.cpp/pull/3776
-            if (use_tensor_cores && min_compute_capability >= CC_VOLTA && src1->ne[1] > MMQ_MAX_BATCH_SIZE) {
-                use_mul_mat_q = false;
-            }
-
-            if (use_mul_mat_q) {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true);
-            } else {
-                ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
-            }
-        }
-    } else {
-        GGML_ASSERT(false);
-    }
-}
-
-#if 0
-template<typename ... Srcs>
-static __global__ void k_compute_batched_ptrs_id(
-        const void ** ptrs_src, void ** ptrs_dst,
-        int ne12, int ne13,
-        int ne23,
-        int nb02, int nb03,
-        int nb12, int nb13,
-        int nb2, int nb3,
-        int r2, int r3,
-        ggml_type src0_type, half * src0_as_f16, int64_t src0_ne,
-        const half * src1_f16, half * dst_f16,
-        const int32_t * ids, const int id,
-        Srcs... src0s) {
-
-    int i = ids[id];
-
-    half * src0_f16;
-    const void * srcs_ar[] = { (const half *) src0s... };
-    if (src0_type == GGML_TYPE_F16) {
-        src0_f16 = (half *) srcs_ar[i];
-    } else {
-        src0_f16 = src0_as_f16;
-        if (threadIdx.x == 0 && threadIdx.y == 0) {
-            const to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(src0_type);
-            to_fp16(srcs_ar[i], src0_f16, src0_ne, cudaStreamFireAndForget);
-        }
-    }
-
-    int i13 = blockIdx.x * blockDim.x + threadIdx.x;
-    int i12 = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (i13 >= ne13 || i12 >= ne12) {
-        return;
-    }
-
-    int i03 = i13 / r3;
-    int i02 = i12 / r2;
-
-    ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_f16 + i02*nb02   + i03*nb03;
-    ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_f16 + i12*nb12/2 + i13*nb13/2;
-    ptrs_dst[0*ne23 + i12 + i13*ne12] = (      char *)  dst_f16 + i12* nb2/2 + i13* nb3/2;
-}
-
-static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) {
-    const struct ggml_tensor * ids = dst->src[0];
-    const struct ggml_tensor * src1 = dst->src[1];
-    const struct ggml_tensor * src00 = dst->src[2];
-
-    const int id = dst->op_params[0];
-
-    GGML_ASSERT(!ggml_is_transposed(src00));
-    GGML_ASSERT(!ggml_is_transposed(src1));
-
-    GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src00->ne[0]; GGML_UNUSED(ne00);
-    const int64_t ne01 = src00->ne[1];
-    const int64_t ne02 = src00->ne[2];
-    const int64_t ne03 = src00->ne[3];
-
-    //const int64_t nb01 = src00->nb[1];
-    const int64_t nb02 = src00->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
-
-    //const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
-
-    const int64_t ne1 = ggml_nelements(src1);
-    const int64_t ne  = ggml_nelements(dst);
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
-
-    CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream));
-
-    //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    //void * src0_ddq = src0_extra->data_device[g_main_device];
-    //half * src0_as_f16 = (half *) src0_ddq;
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
-
-    // convert src1 to fp16
-    const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type);
-    GGML_ASSERT(to_fp16_cuda != nullptr);
-
-    size_t src1_as = 0;
-    half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as);
-    to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream);
-
-    size_t dst_as = 0;
-    half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as);
-
-    GGML_ASSERT(ne12 % ne02 == 0);
-    GGML_ASSERT(ne13 % ne03 == 0);
-
-    // broadcast factors
-    const int64_t r2 = ne12/ne02;
-    const int64_t r3 = ne13/ne03;
-
-    const half alpha_f16 = 1.0f;
-    const half beta_f16  = 0.0f;
-
-    // use cublasGemmBatchedEx
-    const int ne23 = ne12*ne13;
-
-    const void ** ptrs_src = nullptr;
-          void ** ptrs_dst = nullptr;
-
-    size_t ptrs_src_s = 0;
-    size_t ptrs_dst_s = 0;
-
-    ptrs_src = (const void **) ggml_cuda_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s);
-    ptrs_dst = (      void **) ggml_cuda_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s);
-
-    int64_t src0_ne = ggml_nelements(src00);
-    half * src0_as_f16 = nullptr;
-    size_t src0_as = 0;
-    if (src00->type != GGML_TYPE_F16) {
-        src0_as_f16 = (half *) ggml_cuda_pool_malloc(src0_ne * sizeof(half), &src0_as);
-    }
-
-    static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6");
-    dim3 block_dims(ne13, ne12);
-    k_compute_batched_ptrs_id<<<1, block_dims, 0, main_stream>>>(
-            ptrs_src, ptrs_dst,
-            ne12, ne13,
-            ne23,
-            ne00*ne01*sizeof(half), ne00*ne01*ne02*sizeof(half),
-            nb12, nb13,
-            dst->nb[2], dst->nb[3],
-            r2, r3,
-            src00->type, src0_as_f16, src0_ne,
-            src1_as_f16, dst_f16,
-            (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id,
-            dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr,
-            dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr
-    );
-    CUDA_CHECK(cudaGetLastError());
-
-    CUBLAS_CHECK(
-    cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N,
-            ne01, ne11, ne10,
-            &alpha_f16, (const void **) (ptrs_src + 0*ne23), CUDA_R_16F, ne00,
-                        (const void **) (ptrs_src + 1*ne23), CUDA_R_16F, ne10,
-            &beta_f16,  (      void **) (ptrs_dst + 0*ne23), CUDA_R_16F, ne01,
-            ne23,
-            CUBLAS_COMPUTE_16F,
-            CUBLAS_GEMM_DEFAULT_TENSOR_OP));
-
-    if (src0_as != 0) {
-        ggml_cuda_pool_free(src0_as_f16, src0_as);
-    }
-    if (ptrs_src_s != 0) {
-        ggml_cuda_pool_free(ptrs_src, ptrs_src_s);
-    }
-    if (ptrs_dst_s != 0) {
-        ggml_cuda_pool_free(ptrs_dst, ptrs_dst_s);
-    }
-
-    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16);
-    to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream);
-
-    ggml_cuda_pool_free(src1_as_f16, src1_as);
-    ggml_cuda_pool_free(dst_f16, dst_as);
-}
-#endif
-
-static void ggml_cuda_mul_mat_id(const ggml_tensor *src0,
-                                 const ggml_tensor *src1,
-                                 ggml_tensor *dst) try {
-#if 0
-    ggml_cuda_mul_mat_id_cublas(dst);
-    // TODO: mmq/mmv support
-#endif
-
-    GGML_ASSERT(dst->backend == GGML_BACKEND_GPU);
-
-    const struct ggml_tensor * ids = src0;
-    const int32_t id = ((int32_t *) dst->op_params)[0];
-    const int32_t n_as = ((int32_t *) dst->op_params)[1];
-
-    std::vector<char> ids_host(ggml_nbytes(ids));
-
-    if (ids->backend == GGML_BACKEND_GPU) {
-        const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device];
-        CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[g_main_device][0]->memcpy(
-            ids_host.data(), ids_dev, ggml_nbytes(ids))));
-        CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[g_main_device][0]->wait()));
-    } else {
-        memcpy(ids_host.data(), ids->data, ggml_nbytes(ids));
-    }
-
-    const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra;
-    const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra;
-
-    ggml_tensor_extra_gpu src1_row_extra;
-    ggml_tensor_extra_gpu dst_row_extra;
-
-    ggml_tensor src1_row = *src1;
-    ggml_tensor dst_row = *dst;
-
-    src1_row.ne[1] = 1;
-    dst_row.ne[1] = 1;
-
-    src1_row.nb[2] = src1_row.nb[1];
-    dst_row.nb[2] = dst_row.nb[1];
-
-    src1_row.nb[3] = src1_row.nb[1];
-    dst_row.nb[3] = dst_row.nb[1];
-
-    src1_row.extra = &src1_row_extra;
-    dst_row.extra = &dst_row_extra;
-
-
-    for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
-        //int32_t row_id;
-        //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
-        //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
-
-        const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]);
-
-        GGML_ASSERT(row_id >= 0 && row_id < n_as);
-
-        const struct ggml_tensor * src0_row = dst->src[row_id + 2];
-
-        src1_row_extra.data_device[g_main_device] = (char *) src1_extra->data_device[g_main_device] + i01*src1->nb[1];
-        src1_row.data = (char *) src1->data + i01*src1->nb[1];
-
-        dst_row_extra.data_device[g_main_device] = (char *) dst_extra->data_device[g_main_device] + i01*dst->nb[1];
-        dst_row.data = (char *) dst->data + i01*dst->nb[1];
-
-        ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row);
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale);
-}
-
-static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp);
-}
-
-static void ggml_cuda_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
-                          ggml_tensor *dst) try {
-    const int64_t ne = ggml_nelements(src0);
-    GGML_ASSERT(ne == ggml_nelements(src1));
-
-    GGML_ASSERT(src0->backend == GGML_BACKEND_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_GPU);
-
-    GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
-    GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    GGML_ASSERT(src0->ne[3] == 1);
-
-    const int64_t nb00 = src0->nb[0];
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    GGML_ASSERT(src1->ne[3] == 1);
-
-    const int64_t nb10 = src1->nb[0];
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2];
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    dpct::queue_ptr main_stream = g_cudaStreams[g_main_device][0];
-
-    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-
-    char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-    char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
-
-    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f32_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
-        ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
-        ggml_cpy_f32_q4_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
-        ggml_cpy_f32_q4_1_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_f16_f16_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
-    } else {
-        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
-                ggml_type_name(src0->type), ggml_type_name(src1->type));
-        GGML_ASSERT(false);
-    }
-
-    (void) dst;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    // TODO: why do we pass dst as src1 here?
-    ggml_cuda_cpy(src0, dst, nullptr);
-    (void) src1;
-}
-
-static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf);
-}
-
-static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max);
-}
-
-static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope);
-}
-
-static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi);
-}
-
-static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col);
-}
-
-static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows);
-}
-
-static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    GGML_ASSERT(ggml_is_contiguous(src0));
-    ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort);
-}
-
-static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
-    (void) src0;
-    (void) src1;
-    (void) dst;
-}
-
-static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) {
-    static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
-
-    return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
-}
-
-void ggml_cuda_transform_tensor(void *data, struct ggml_tensor *tensor) try {
-    const int64_t nrows = ggml_nrows(tensor);
-
-    const int64_t ne0 = tensor->ne[0];
-
-    const size_t nb1 = tensor->nb[1];
-
-    ggml_backend_type backend = tensor->backend;
-    ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
-    memset(extra, 0, sizeof(*extra));
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (backend == GGML_BACKEND_GPU && id != g_main_device) {
-            continue;
-        }
-
-        ggml_cuda_set_device(id);
-
-        int64_t row_low, row_high;
-        if (backend == GGML_BACKEND_GPU) {
-            row_low = 0;
-            row_high = nrows;
-        } else if (backend == GGML_BACKEND_GPU_SPLIT) {
-            const int64_t rounding = get_row_rounding(tensor->type);
-
-            row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
-            row_low -= row_low % rounding;
-
-            if (id == g_device_count - 1) {
-                row_high = nrows;
-            } else {
-                row_high = nrows*g_tensor_split[id + 1];
-                row_high -= row_high % rounding;
-            }
-        } else {
-            GGML_ASSERT(false);
-        }
-        if (row_low == row_high) {
-            continue;
-        }
-
-        int64_t nrows_split = row_high - row_low;
-
-        const size_t offset_split = row_low*nb1;
-        size_t size = ggml_nbytes_split(tensor, nrows_split);
-        const size_t original_size = size;
-
-        // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-
-        char * buf;
-        CUDA_CHECK(DPCT_CHECK_ERROR(buf = (char *)sycl::malloc_device(
-                                        size, dpct::get_in_order_queue())));
-        char * buf_host = (char*)data + offset_split;
-
-        // set padding to 0 to avoid possible NaN values
-        if (size > original_size) {
-            CUDA_CHECK(DPCT_CHECK_ERROR(
-                dpct::get_in_order_queue()
-                    .memset(buf + original_size, 0, size - original_size)
-                    .wait()));
-        }
-
-        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
-                                        .memcpy(buf, buf_host, original_size)
-                                        .wait()));
-
-        extra->data_device[id] = buf;
-
-        if (backend == GGML_BACKEND_GPU_SPLIT) {
-            for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-                CUDA_CHECK(DPCT_CHECK_ERROR(extra->events[id][is] =
-                                                new sycl::event()));
-            }
-        }
-    }
-
-    tensor->extra = extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_free_data(struct ggml_tensor *tensor) try {
-    if (!tensor || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
-    for (int64_t id = 0; id < g_device_count; ++id) {
-        if (extra->data_device[id] != nullptr) {
-            CUDA_CHECK(ggml_cuda_set_device(id));
-            CUDA_CHECK(DPCT_CHECK_ERROR(sycl::free(
-                extra->data_device[id], dpct::get_in_order_queue())));
-        }
-
-        for (int64_t is = 0; is < MAX_STREAMS; ++is) {
-            if (extra->events[id][is] != nullptr) {
-                CUDA_CHECK(ggml_cuda_set_device(id));
-                CUDA_CHECK(DPCT_CHECK_ERROR(
-                    dpct::destroy_event(extra->events[id][is])));
-            }
-        }
-    }
-
-    delete extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
-static size_t g_temp_tensor_extra_index = 0;
-
-static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
-    if (g_temp_tensor_extras == nullptr) {
-        g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
-    }
-
-    size_t alloc_index = g_temp_tensor_extra_index;
-    g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
-    ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
-    memset(extra, 0, sizeof(*extra));
-
-    return extra;
-}
-
-static void ggml_cuda_assign_buffers_impl(struct ggml_tensor *tensor,
-                                          bool scratch, bool force_inplace,
-                                          bool no_alloc) try {
-    if (scratch && g_scratch_size == 0) {
-        return;
-    }
-
-    tensor->backend = GGML_BACKEND_GPU;
-
-    // recursively assign CUDA buffers until a compute tensor is found
-    if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
-        const ggml_op src0_op = tensor->src[0]->op;
-        if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
-            ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
-        }
-    }
-    if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
-        ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
-    }
-
-    if (scratch && no_alloc) {
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra;
-
-    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW ||
-        force_inplace;
-    const size_t size = ggml_nbytes(tensor);
-
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
-        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-        size_t offset = 0;
-        if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&offset, tensor->op_params, sizeof(size_t));
-        }
-        extra = ggml_cuda_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device] = src0_ddc + offset;
-    } else if (tensor->op == GGML_OP_CPY) {
-        ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
-        void * src1_ddv = src1_extra->data_device[g_main_device];
-        extra = ggml_cuda_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device] = src1_ddv;
-    } else if (scratch) {
-        GGML_ASSERT(size <= g_scratch_size);
-        if (g_scratch_offset + size > g_scratch_size) {
-            g_scratch_offset = 0;
-        }
-
-        char * data = (char *) g_scratch_buffer;
-        if (data == nullptr) {
-            CUDA_CHECK(DPCT_CHECK_ERROR(
-                data = (char *)sycl::malloc_device(
-                    g_scratch_size, dpct::get_in_order_queue())));
-            g_scratch_buffer = data;
-        }
-        extra = ggml_cuda_alloc_temp_tensor_extra();
-        extra->data_device[g_main_device] = data + g_scratch_offset;
-
-        g_scratch_offset += size;
-
-        GGML_ASSERT(g_scratch_offset <= g_scratch_size);
-    } else { // allocate new buffers outside of scratch
-        void * data;
-        CUDA_CHECK(DPCT_CHECK_ERROR(data = (void *)sycl::malloc_device(
-                                        size, dpct::get_in_order_queue())));
-        CUDA_CHECK(DPCT_CHECK_ERROR(
-            dpct::get_in_order_queue().memset(data, 0, size).wait()));
-        extra = new ggml_tensor_extra_gpu;
-        memset(extra, 0, sizeof(*extra));
-        extra->data_device[g_main_device] = data;
-    }
-
-    tensor->extra = extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_assign_scratch_offset(struct ggml_tensor *tensor,
-                                     size_t offset) try {
-    if (g_scratch_size == 0) {
-        return;
-    }
-    if (g_scratch_buffer == nullptr) {
-        ggml_cuda_set_device(g_main_device);
-        CUDA_CHECK(
-            DPCT_CHECK_ERROR(g_scratch_buffer = (void *)sycl::malloc_device(
-                                 g_scratch_size, dpct::get_in_order_queue())));
-    }
-
-    ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
-
-    const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
-        tensor->op == GGML_OP_VIEW;
-
-    if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
-        ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
-        char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
-        size_t view_offset = 0;
-        if (tensor->op == GGML_OP_VIEW) {
-            memcpy(&view_offset, tensor->op_params, sizeof(size_t));
-        }
-        extra->data_device[g_main_device] = src0_ddc + view_offset;
-    } else {
-        extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
-    }
-
-    tensor->extra = extra;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_copy_to_device(struct ggml_tensor *tensor) try {
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-    GGML_ASSERT(ggml_is_contiguous(tensor));
-
-    ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-    CUDA_CHECK(ggml_cuda_set_device(g_main_device));
-    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_in_order_queue()
-                                    .memcpy(extra->data_device[g_main_device],
-                                            tensor->data, ggml_nbytes(tensor))
-                                    .wait()));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false, false);
-}
-
-void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, true, false, true);
-}
-
-void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, false, false);
-}
-
-void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
-    ggml_cuda_assign_buffers_impl(tensor, false, true, false);
-}
-
-void ggml_cuda_set_main_device(const int main_device) try {
-    if (main_device >= g_device_count) {
-        fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
-                main_device, g_device_count, g_main_device);
-        return;
-    }
-
-    if (g_main_device != main_device && g_device_count > 1) {
-        g_main_device = main_device;
-        dpct::device_info prop;
-        CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(g_main_device))));
-        fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__,
-                g_main_device, prop.get_name());
-    }
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_set_scratch_size(const size_t scratch_size) {
-    // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
-    // it still won't always work as expected, but it's better than nothing
-    if (scratch_size > g_scratch_size) {
-        ggml_cuda_free_scratch();
-    }
-    g_scratch_size = std::max(g_scratch_size, scratch_size);
-}
-
-void ggml_cuda_free_scratch() try {
-    if (g_scratch_buffer == nullptr) {
-        return;
-    }
-
-    CUDA_CHECK(DPCT_CHECK_ERROR(
-        sycl::free(g_scratch_buffer, dpct::get_in_order_queue())));
-    g_scratch_buffer = nullptr;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
-    if (!g_cublas_loaded) return false;
-
-    ggml_cuda_func_t func;
-    const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
-        || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
-        || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
-
-    if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
-        return false;
-    }
-
-    if (tensor->op == GGML_OP_MUL_MAT) {
-        if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
-#ifndef NDEBUG
-            fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = " PRId64 ", src1->ne[3] = " PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]);
-#endif
-            return false;
-        }
-    }
-
-    switch (tensor->op) {
-        case GGML_OP_REPEAT:
-            func = ggml_cuda_repeat;
-            break;
-        case GGML_OP_GET_ROWS:
-            func = ggml_cuda_get_rows;
-            break;
-        case GGML_OP_DUP:
-            func = ggml_cuda_dup;
-            break;
-        case GGML_OP_ADD:
-            func = ggml_cuda_add;
-            break;
-        case GGML_OP_ACC:
-            func = ggml_cuda_acc;
-            break;
-        case GGML_OP_MUL:
-            func = ggml_cuda_mul;
-            break;
-        case GGML_OP_DIV:
-            func = ggml_cuda_div;
-            break;
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(tensor)) {
-                case GGML_UNARY_OP_GELU:
-                    func = ggml_cuda_gelu;
-                    break;
-                case GGML_UNARY_OP_SILU:
-                    func = ggml_cuda_silu;
-                    break;
-                case GGML_UNARY_OP_GELU_QUICK:
-                    func = ggml_cuda_gelu_quick;
-                    break;
-                case GGML_UNARY_OP_TANH:
-                    func = ggml_cuda_tanh;
-                    break;
-                case GGML_UNARY_OP_RELU:
-                    func = ggml_cuda_relu;
-                    break;
-                default:
-                    return false;
-            }
-            break;
-        case GGML_OP_NORM:
-            func = ggml_cuda_norm;
-            break;
-        case GGML_OP_GROUP_NORM:
-            func = ggml_cuda_group_norm;
-            break;
-        case GGML_OP_CONCAT:
-            func = ggml_cuda_concat;
-            break;
-        case GGML_OP_UPSCALE:
-            func = ggml_cuda_upscale;
-            break;
-        case GGML_OP_PAD:
-            func = ggml_cuda_pad;
-            break;
-        case GGML_OP_LEAKY_RELU:
-            func = ggml_cuda_leaky_relu;
-            break;
-        case GGML_OP_RMS_NORM:
-            func = ggml_cuda_rms_norm;
-            break;
-        case GGML_OP_MUL_MAT:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) {
-                return false;
-            }
-            func = ggml_cuda_mul_mat;
-            break;
-        case GGML_OP_MUL_MAT_ID:
-            if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) {
-                return false;
-            }
-            func = ggml_cuda_mul_mat_id;
-            break;
-        case GGML_OP_SCALE:
-            func = ggml_cuda_scale;
-            break;
-        case GGML_OP_SQR:
-            func = ggml_cuda_sqr;
-            break;
-        case GGML_OP_CLAMP:
-            func = ggml_cuda_clamp;
-            break;
-        case GGML_OP_CPY:
-            func = ggml_cuda_cpy;
-            break;
-        case GGML_OP_CONT:
-            func = ggml_cuda_dup;
-            break;
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-            func = ggml_cuda_nop;
-            break;
-        case GGML_OP_DIAG_MASK_INF:
-            func = ggml_cuda_diag_mask_inf;
-            break;
-        case GGML_OP_SOFT_MAX:
-            func = ggml_cuda_soft_max;
-            break;
-        case GGML_OP_ROPE:
-            func = ggml_cuda_rope;
-            break;
-        case GGML_OP_ALIBI:
-            func = ggml_cuda_alibi;
-            break;
-        case GGML_OP_IM2COL:
-            func = ggml_cuda_im2col;
-            break;
-        case GGML_OP_SUM_ROWS:
-            func = ggml_cuda_sum_rows;
-            break;
-        case GGML_OP_ARGSORT:
-            func = ggml_cuda_argsort;
-            break;
-        default:
-            return false;
-    }
-
-    if (params->ith != 0) {
-        return true;
-    }
-    if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
-        return true;
-    }
-    func(tensor->src[0], tensor->src[1], tensor);
-    return true;
-}
-
-int ggml_cuda_get_device_count() try {
-    int device_count;
-    if (DPCT_CHECK_ERROR(device_count =
-                             dpct::dev_mgr::instance().device_count()) != 0) {
-        return 0;
-    }
-    return device_count;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-void ggml_cuda_get_device_description(int device, char *description,
-                                      size_t description_size) try {
-    dpct::device_info prop;
-    CUDA_CHECK(DPCT_CHECK_ERROR(dpct::get_device_info(
-        prop, dpct::dev_mgr::instance().get_device(device))));
-    snprintf(description, description_size, "%s", prop.get_name());
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-////////////////////////////////////////////////////////////////////////////////
-
-// backend interface
-
-#define UNUSED GGML_UNUSED
-
-// cuda buffer
-
-struct ggml_backend_buffer_context_cuda {
-    int device;
-    void * dev_ptr = nullptr;
-    ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
-    size_t temp_tensor_extra_index = 0;
-
-    ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
-
-    ~ggml_backend_buffer_context_cuda() {
-        delete[] temp_tensor_extras;
-    }
-
-    ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
-        if (temp_tensor_extras == nullptr) {
-            temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
-        }
-
-        size_t alloc_index = temp_tensor_extra_index;
-        temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
-        ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
-        memset(extra, 0, sizeof(*extra));
-
-        return extra;
-    }
-};
-
-static void
-ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
-    delete ctx;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    return ctx->dev_ptr;
-}
-
-static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer,
-                                                 ggml_tensor *tensor) try {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-
-    if (tensor->view_src != NULL && tensor->view_offs == 0) {
-        assert(tensor->view_src->buffer->buft == buffer->buft); // TODO
-        tensor->backend = tensor->view_src->backend;
-        tensor->extra = tensor->view_src->extra;
-        return;
-    }
-
-    ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
-
-    extra->data_device[ctx->device] = tensor->data;
-
-    tensor->backend = GGML_BACKEND_GPU;
-    tensor->extra = extra;
-
-    if (ggml_is_quantized(tensor->type)) {
-        // initialize padding to 0 to avoid possible NaN values
-        int64_t row_low = 0;
-        int64_t row_high = ggml_nrows(tensor);
-        int64_t nrows_split = row_high - row_low;
-
-        size_t original_size = ggml_nbytes_split(tensor, nrows_split);
-        size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
-
-        if (padded_size > original_size && tensor->view_src == nullptr) {
-            CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[ctx->device][0]->memset(
-                (char *)tensor->data + original_size, 0,
-                padded_size - original_size)));
-        }
-    }
-
-    UNUSED(buffer);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer,
-                                                ggml_tensor *tensor,
-                                                const void *data, size_t offset,
-                                                size_t size) try {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(dpct::get_in_order_queue()
-                             .memcpy((char *)tensor->data + offset, data, size)
-                             .wait()));
-
-    UNUSED(buffer);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer,
-                                                const ggml_tensor *tensor,
-                                                void *data, size_t offset,
-                                                size_t size) try {
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    CUDA_CHECK(DPCT_CHECK_ERROR(
-        dpct::get_in_order_queue()
-            .memcpy(data, (const char *)tensor->data + offset, size)
-            .wait()));
-
-    UNUSED(buffer);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
-    /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
-    /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
-    /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
-    /* .set_tensor      = */ ggml_backend_cuda_buffer_set_tensor,
-    /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
-    /* .cpy_tensor_from = */ NULL,
-    /* .cpy_tensor_to   = */ NULL,
-};
-
-// cuda buffer type
-
-static ggml_backend_buffer_t
-ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
-                                           size_t size) try {
-    int device = (int) (intptr_t) buft->context;
-
-    ggml_cuda_set_device(device);
-
-    size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
-
-    void * dev_ptr;
-    CUDA_CHECK(DPCT_CHECK_ERROR(dev_ptr = (void *)sycl::malloc_device(
-                                    size, dpct::get_in_order_queue())));
-
-    ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
-
-    return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return 128;
-
-    UNUSED(buft);
-}
-
-static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
-    int64_t row_low = 0;
-    int64_t row_high = ggml_nrows(tensor);
-    int64_t nrows_split = row_high - row_low;
-
-    size_t size = ggml_nbytes_split(tensor, nrows_split);
-
-    int64_t ne0 = tensor->ne[0];
-
-    if (ggml_is_quantized(tensor->type)) {
-        if (ne0 % MATRIX_ROW_PADDING != 0) {
-            size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
-        }
-    }
-
-    return size;
-
-    UNUSED(buft);
-}
-
-static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
-    return ggml_backend_is_cuda(backend);
-
-    UNUSED(buft);
-}
-
-static ggml_backend_buffer_type_i cuda_backend_buffer_type_interface = {
-    /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
-    /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
-};
-
-ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda[GGML_CUDA_MAX_DEVICES];
-    static bool ggml_backend_buffer_type_cuda_initialized = false;
-    if (!ggml_backend_buffer_type_cuda_initialized) {
-        for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
-            ggml_backend_buffer_type_cuda[i] = {
-                /* .iface    = */ cuda_backend_buffer_type_interface,
-                /* .context  = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
-            };
-        }
-        ggml_backend_buffer_type_cuda_initialized = true;
-    }
-
-    return &ggml_backend_buffer_type_cuda[device];
-}
-
-// host buffer type
-
-static void
-ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
-    ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
-    CUDA_CHECK(
-        DPCT_CHECK_ERROR(sycl::free(ctx->dev_ptr, dpct::get_in_order_queue())));
-    delete ctx;
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static ggml_backend_buffer_t
-ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
-                                                size_t size) try {
-    void * ptr;
-    CUDA_CHECK(DPCT_CHECK_ERROR(
-        ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue())));
-
-    // FIXME: this is a hack to avoid having to implement a new buffer type
-    ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
-    buffer->buft = buft;
-    buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
-
-    return buffer;
-
-    UNUSED(buft);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-struct ggml_backend_buffer_type_i cuda_backend_host_buffer_type_interface = {
-    /* .alloc_buffer     = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
-    /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
-    /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
-    /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
-};
-
-ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
-    static struct ggml_backend_buffer_type ggml_backend_buffer_type_cuda_host = {
-        /* .iface    = */ cuda_backend_host_buffer_type_interface,
-        /* .context  = */ nullptr,
-    };
-
-    return &ggml_backend_buffer_type_cuda_host;
-}
-
-// backend
-
-struct ggml_backend_context_cuda {
-    int device;
-};
-
-static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
-    return GGML_CUDA_NAME;
-
-    UNUSED(backend);
-}
-
-static void ggml_backend_cuda_free(ggml_backend_t backend) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    delete cuda_ctx;
-    delete backend;
-}
-
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    return ggml_backend_cuda_buffer_type(cuda_ctx->device);
-}
-
-static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend,
-                                               ggml_tensor *tensor,
-                                               const void *data, size_t offset,
-                                               size_t size) try {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
-        (char *)tensor->data + offset, data, size)));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend,
-                                               const ggml_tensor *tensor,
-                                               void *data, size_t offset,
-                                               size_t size) try {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
-    GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
-    GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
-    GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
-
-    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->memcpy(
-        data, (const char *)tensor->data + offset, size)));
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static void ggml_backend_cuda_synchronize(ggml_backend_t backend) try {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    CUDA_CHECK(DPCT_CHECK_ERROR(g_cudaStreams[cuda_ctx->device][0]->wait()));
-
-    UNUSED(backend);
-}
-catch (sycl::exception const &exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-            << ", line:" << __LINE__ << std::endl;
-  std::exit(1);
-}
-
-static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    GGML_ASSERT(!"not implemented");
-
-    return nullptr;
-
-    UNUSED(backend);
-    UNUSED(cgraph);
-}
-
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_ASSERT(!"not implemented");
-
-    UNUSED(backend);
-    UNUSED(plan);
-}
-
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
-    GGML_ASSERT(!"not implemented");
-
-    UNUSED(backend);
-    UNUSED(plan);
-}
-
-static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
-    ggml_cuda_set_main_device(cuda_ctx->device);
-
-    ggml_compute_params params = {};
-    params.type = GGML_TASK_COMPUTE;
-    params.ith = 0;
-    for (int i = 0; i < cgraph->n_nodes; i++) {
-        ggml_tensor * node = cgraph->nodes[i];
-
-        if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
-            continue;
-
-        assert(node->backend == GGML_BACKEND_GPU);
-        assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-        assert(node->extra != nullptr);
-
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (node->src[j] != nullptr) {
-                assert(node->src[j]->backend == GGML_BACKEND_GPU);
-                assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
-                assert(node->src[j]->extra != nullptr);
-            }
-        }
-
-        bool ok = ggml_cuda_compute_forward(&params, node);
-        if (!ok) {
-            fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
-        }
-        GGML_ASSERT(ok);
-
-#if 0
-        if (node->type == GGML_TYPE_F32) {
-            cudaDeviceSynchronize();
-            std::vector<float> tmp(ggml_nelements(node), 0.0f);
-            cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
-            printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
-                ggml_type_name(node->src[0]->type),
-                node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
-                node->src[0]->name,
-                node->src[1] ? node->src[1]->name : "none");
-            double sum = 0.0;
-            double sq_sum = 0.0;
-            for (int i = 0; i < ggml_nelements(node); i++) {
-                printf("%f ", tmp[i]);
-                sum += tmp[i];
-                sq_sum += tmp[i]*tmp[i];
-            }
-            printf("\n");
-            printf("sum: %f, ", sum);
-            printf("sq_sum: %f\n", sq_sum);
-        }
-#endif
-    }
-
-    UNUSED(backend);
-}
-
-static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
-    switch (op->op) {
-        case GGML_OP_UNARY:
-            switch (ggml_get_unary_op(op)) {
-                case GGML_UNARY_OP_GELU:
-                case GGML_UNARY_OP_SILU:
-                case GGML_UNARY_OP_RELU:
-                case GGML_UNARY_OP_GELU_QUICK:
-                case GGML_UNARY_OP_TANH:
-                    return true;
-                default:
-                    return false;
-            }
-            break;
-        case GGML_OP_MUL_MAT:
-        case GGML_OP_MUL_MAT_ID:
-            {
-                struct ggml_tensor * a;
-                struct ggml_tensor * b;
-                if (op->op == GGML_OP_MUL_MAT) {
-                    a = op->src[0];
-                    b = op->src[1];
-                } else {
-                    a = op->src[2];
-                    b = op->src[1];
-                }
-                if (a->ne[3] != b->ne[3]) {
-                    return false;
-                }
-                return true;
-            } break;
-        case GGML_OP_GET_ROWS:
-            {
-                switch (op->src[0]->type) {
-                    case GGML_TYPE_F16:
-                    case GGML_TYPE_F32:
-                    case GGML_TYPE_Q4_0:
-                    case GGML_TYPE_Q4_1:
-                    case GGML_TYPE_Q5_0:
-                    case GGML_TYPE_Q5_1:
-                    case GGML_TYPE_Q8_0:
-                        return true;
-                    default:
-                        return false;
-                }
-            } break;
-        case GGML_OP_CPY:
-            {
-                ggml_type src0_type = op->src[0]->type;
-                ggml_type src1_type = op->src[1]->type;
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F32) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q8_0) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_0) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F32 && src1_type == GGML_TYPE_Q4_1) {
-                    return true;
-                }
-                if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) {
-                    return true;
-                }
-                return false;
-            } break;
-        case GGML_OP_NONE:
-        case GGML_OP_RESHAPE:
-        case GGML_OP_VIEW:
-        case GGML_OP_PERMUTE:
-        case GGML_OP_TRANSPOSE:
-        case GGML_OP_NORM:
-        case GGML_OP_REPEAT:
-        case GGML_OP_DUP:
-        case GGML_OP_ADD:
-        case GGML_OP_MUL:
-        case GGML_OP_DIV:
-        case GGML_OP_RMS_NORM:
-        case GGML_OP_SCALE:
-        case GGML_OP_SQR:
-        case GGML_OP_CLAMP:
-        case GGML_OP_CONT:
-        case GGML_OP_DIAG_MASK_INF:
-        case GGML_OP_SOFT_MAX:
-        case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
-        case GGML_OP_IM2COL:
-        case GGML_OP_SUM_ROWS:
-        case GGML_OP_ARGSORT:
-        case GGML_OP_ACC:
-        case GGML_OP_CONCAT:
-        case GGML_OP_GROUP_NORM:
-        case GGML_OP_UPSCALE:
-        case GGML_OP_PAD:
-        case GGML_OP_LEAKY_RELU:
-            return true;
-        default:
-            return false;
-    }
-
-    UNUSED(backend);
-}
-
-static ggml_backend_i cuda_backend_i = {
-    /* .get_name                = */ ggml_backend_cuda_name,
-    /* .free                    = */ ggml_backend_cuda_free,
-    /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
-    /* .set_tensor_async        = */ ggml_backend_cuda_set_tensor_async,
-    /* .get_tensor_async        = */ ggml_backend_cuda_get_tensor_async,
-    /* .cpy_tensor_from_async   = */ NULL,
-    /* .cpy_tensor_to_async     = */ NULL,
-    /* .synchronize             = */ ggml_backend_cuda_synchronize,
-    /* .graph_plan_create       = */ ggml_backend_cuda_graph_plan_create,
-    /* .graph_plan_free         = */ ggml_backend_cuda_graph_plan_free,
-    /* .graph_plan_compute      = */ ggml_backend_cuda_graph_plan_compute,
-    /* .graph_compute           = */ ggml_backend_cuda_graph_compute,
-    /* .supports_op             = */ ggml_backend_cuda_supports_op,
-};
-
-ggml_backend_t ggml_backend_cuda_init(int device) {
-    ggml_init_cublas(); // TODO: remove from ggml.c
-
-    if (device < 0 || device >= ggml_cuda_get_device_count()) {
-        fprintf(stderr, "%s: error: invalid device %d\n", __func__, device);
-        return nullptr;
-    }
-
-    // not strictly necessary, but it may reduce the overhead of the first graph_compute
-    ggml_cuda_set_main_device(device);
-
-    ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
-        /* .device = */ device
-    };
-
-    ggml_backend_t cuda_backend = new ggml_backend {
-        /* .interface = */ cuda_backend_i,
-        /* .context   = */ ctx
-    };
-
-    return cuda_backend;
-}
-
-bool ggml_backend_is_cuda(ggml_backend_t backend) {
-    return backend->iface.get_name == ggml_backend_cuda_name;
-}
-
-static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
-    ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
-    return cuda_backend;
-
-    UNUSED(params);
-}
-
-extern "C" int ggml_backend_cuda_reg_devices();
-
-int ggml_backend_cuda_reg_devices() {
-    int device_count = ggml_cuda_get_device_count();
-    //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization
-    printf("device count=%d\n", device_count);
-    for (int i = 0; i < device_count; i++) {
-        char name[128];
-        snprintf(name, sizeof(name), "%s%d", GGML_CUDA_NAME, i);
-        printf("register name=%s\n", name);
-        ggml_backend_register(name, ggml_backend_reg_cuda_init, ggml_backend_cuda_buffer_type(i), (void *) (intptr_t) i);
-    }
-    return device_count;
-}

From 623d8031cbd37e829a4568e118eac3cc4226bc69 Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Fri, 19 Jan 2024 10:26:19 +0800
Subject: [PATCH 36/90] fix code err

---
 ggml-sycl.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 4de78a7666c28..3e7d9e3a15178 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -12062,7 +12062,7 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph
     }
 
     UNUSED(backend);
-    return;
+    return true;
 }
 
 static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) {

From f396a3b65ed6bd9858f04a1dd75c7607de8c8985 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Sat, 20 Jan 2024 14:54:20 +0800
Subject: [PATCH 37/90] add know issue for pvc hang issue

---
 README_sycl.md | 32 ++++++++++++++++++++++++++++----
 1 file changed, 28 insertions(+), 4 deletions(-)

diff --git a/README_sycl.md b/README_sycl.md
index df84e99b2049e..8b70823b3b648 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -1,5 +1,19 @@
 # llama.cpp for SYCL
 
+[Background](#background)
+
+[OS](#os)
+
+[Intel GPU](#intel-gpu)
+
+[Linux](#linux)
+
+[Environment Variable](#environment-variable)
+
+[Known Issue](#known-issue)
+
+[Todo](#todo)
+
 ## Background
 
 SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
@@ -8,9 +22,6 @@ oneAPI is a specification that is open and standards-based, supporting multiple
 
 Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.
 
-
-## llama.cpp for SYCL
-
 To avoid to re-invent the wheel, this code refer other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
 
 The llama.cpp for SYCL is used to support Intel GPUs.
@@ -190,7 +201,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 ```
 
 
-### Environment Variable
+## Environment Variable
 
 #### Build
 
@@ -209,3 +220,16 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 |-|-|-|
 |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
+
+## Known Issue
+
+-  Hang during startup
+
+  llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
+  Solution: add **--no-mmap**.
+
+## Todo
+
+- Support to build in Windows.
+
+- Support multiple cards.
\ No newline at end of file

From f008cc7b68e897debdf5353466dcc1f09b8c15fd Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Mon, 22 Jan 2024 13:41:09 +0800
Subject: [PATCH 38/90] enable SYCL_F16 support

---
 CMakeLists.txt |  6 ++++--
 README_sycl.md |  4 ++--
 ggml-sycl.cpp  | 14 +++++++++++---
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 035c66c08599e..6fe3488d9d443 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -105,6 +105,7 @@ option(LLAMA_SYCL                            "llama: use SYCL"
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
+option(LLAMA_SYCL_F16                        "llama: use 16 bit floats for sycl calculations"   OFF)
 
 option(LLAMA_BUILD_TESTS                     "llama: build tests"    ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES                  "llama: build examples" ${LLAMA_STANDALONE})
@@ -455,8 +456,9 @@ if (LLAMA_SYCL)
     #todo: AOT
 
     find_package(IntelSYCL REQUIRED)
-
-    #add_compile_definitions(GGML_SYCL_F16)
+    if (LLAMA_SYCL_F16)
+        add_compile_definitions(GGML_SYCL_F16)
+    endif()
     add_compile_definitions(GGML_USE_SYCL)
 
     add_compile_options(-I./) #include DPCT 
diff --git a/README_sycl.md b/README_sycl.md
index 8b70823b3b648..993155071e191 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -120,7 +120,7 @@ cd build
 source /opt/intel/oneapi/setvars.sh
 
 #for FP16
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
 
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
@@ -232,4 +232,4 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 
 - Support to build in Windows.
 
-- Support multiple cards.
\ No newline at end of file
+- Support multiple cards.
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 3e7d9e3a15178..9253c9534d24a 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -9297,9 +9297,17 @@ inline void ggml_sycl_op_mul_mat_sycl(
     int ldc = dst->backend == GGML_BACKEND_GPU && device_id == g_main_device ? ne0 : row_diff;
 
     const int compute_capability = g_device_caps[id].cc;
-
-    // if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
-    if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+#ifdef GGML_SYCL_F16
+    bool use_fp16 = true;  // TODO(Yu) SYCL capability check
+#else
+    bool use_fp16 = false;
+#endif
+    // if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 ||
+    // ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff ==
+    // src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
+    if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
+        use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] &&
+        dst->op_params[0] == GGML_PREC_DEFAULT) {
 
         // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32
         // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n");

From 67e6b3cb7d7b14069031d40351e916e532e94dd2 Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Tue, 23 Jan 2024 03:32:09 +0000
Subject: [PATCH 39/90] align pr4766

---
 common/common.cpp |  8 ++++-
 ggml-sycl.cpp     | 79 +++++++++++++++++++++++++++++++++++------------
 ggml-sycl.h       |  3 --
 llama.cpp         | 15 ++++-----
 4 files changed, 74 insertions(+), 31 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index b26daf2fdcef9..abe752352ba64 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -651,9 +651,15 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                     params.tensor_split[i] = 0.0f;
                 }
             }
-#ifndef GGML_USE_CLBLAS_SYCL
+#else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
 #endif // GGML_USE_CLBLAS_SYCL
+        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
+            params.mul_mat_q = false;
+#else
+            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS or SYCL. Disabling mul_mat_q kernels has no effect.\n");
+#endif // GGML_USE_CUBLAS
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--numa") {
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 9253c9534d24a..4995115551c92 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -11621,17 +11621,23 @@ catch (sycl::exception const &exc) {
 
 #define UNUSED GGML_UNUSED
 
+struct ggml_backend_sycl_context {
+    int device;
+    std::string name;
+};
+
 // sycl buffer
 
-struct ggml_backend_buffer_context_sycl {
+struct ggml_backend_sycl_buffer_context {
     int device;
     void * dev_ptr = nullptr;
     ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
     size_t temp_tensor_extra_index = 0;
+    std::string name;
 
-    ggml_backend_buffer_context_sycl(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+     ggml_backend_sycl_buffer_context(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
 
-    ~ggml_backend_buffer_context_sycl() {
+    ~ ggml_backend_sycl_buffer_context() {
         delete[] temp_tensor_extras;
     }
 
@@ -11649,9 +11655,18 @@ struct ggml_backend_buffer_context_sycl {
     }
 };
 
+GGML_CALL static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
+    ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
+    return ctx->name.c_str();
+}
+
+GGML_CALL static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
+    return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
+}
+
 static void
 ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try {
-    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
+     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
     ggml_sycl_set_device(ctx->device);
     int device_index = get_device_index_by_id(ctx->device);
     const dpct::queue_ptr stream = g_syclStreams[device_index][0];
@@ -11667,13 +11682,13 @@ catch (sycl::exception const &exc) {
 }
 
 static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) {
-    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
+     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
     return ctx->dev_ptr;
 }
 
 static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer,
                                                  ggml_tensor *tensor) try {
-    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
+     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
 
     if (tensor->view_src != NULL && tensor->view_offs == 0) {
         assert(tensor->view_src->buffer->buft == buffer->buft);
@@ -11719,7 +11734,7 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
                                                 size_t size) try {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
+     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
 
     ggml_sycl_set_device(ctx->device);
     int device_index = get_device_index_by_id(ctx->device);
@@ -11744,7 +11759,7 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer,
                                                 size_t size) try {
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 
-    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
+     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
 
     ggml_sycl_set_device(ctx->device);
     int device_index = get_device_index_by_id(ctx->device);
@@ -11766,7 +11781,7 @@ catch (sycl::exception const &exc) {
 
 static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer,
                                            uint8_t value) try {
-    ggml_backend_buffer_context_sycl * ctx = (ggml_backend_buffer_context_sycl *)buffer->context;
+     ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context;
 
     ggml_sycl_set_device(ctx->device);
     int device_index = get_device_index_by_id(ctx->device);
@@ -11784,18 +11799,29 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static struct ggml_backend_buffer_i sycl_backend_buffer_interface = {
+static struct ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
+    /* .get_name        = */ ggml_backend_sycl_buffer_get_name,
     /* .free_buffer     = */ ggml_backend_sycl_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_sycl_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_sycl_buffer_init_tensor,
     /* .set_tensor      = */ ggml_backend_sycl_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_sycl_buffer_get_tensor,
-    /* .cpy_tensor_from = */ NULL,
-    /* .cpy_tensor_to   = */ NULL,
+    /* .cpy_tensor      = */ NULL,
     /* .clear           = */ ggml_backend_sycl_buffer_clear,
+    /* .reset           = */ NULL,
 };
 
 // sycl buffer type
+struct ggml_backend_sycl_buffer_type_context {
+    int device;
+    std::string name;
+};
+
+GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+
+    return ctx->name.c_str();
+}
 
 static ggml_backend_buffer_t
 ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
@@ -11811,9 +11837,9 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
     SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
                                     size, *stream)));
 
-    ggml_backend_buffer_context_sycl * ctx = new ggml_backend_buffer_context_sycl(device, dev_ptr);
+     ggml_backend_sycl_buffer_context * ctx = new  ggml_backend_sycl_buffer_context(device, dev_ptr);
 
-    return ggml_backend_buffer_init(buft, sycl_backend_buffer_interface, ctx, size);
+    return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size);
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__
@@ -11827,7 +11853,7 @@ static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_ty
     UNUSED(buft);
 }
 
-static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     int64_t row_low = 0;
     int64_t row_high = ggml_nrows(tensor);
     int64_t nrows_split = row_high - row_low;
@@ -11854,6 +11880,7 @@ static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_t
 }
 
 static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
+    /* .get_name         = */ ggml_backend_sycl_buffer_type_name,    
     /* .alloc_buffer     = */ ggml_backend_sycl_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_sycl_buffer_type_get_alignment,
     /* .get_alloc_size   = */ ggml_backend_sycl_buffer_type_get_alloc_size,
@@ -11881,6 +11908,18 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
 
 // host buffer type
 
+GGML_CALL static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+    return GGML_SYCL_NAME "_Host";
+
+    UNUSED(buft);
+}
+
+GGML_CALL static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
+    return GGML_SYCL_NAME "_Host";
+
+    UNUSED(buffer);
+}
+
 static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
     ggml_sycl_host_free(buffer->context);
 }
@@ -11904,6 +11943,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm
 ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() {
     static struct ggml_backend_buffer_type ggml_backend_sycl_buffer_type_host = {
         /* .iface    = */ {
+            /* .get_name         = */ ggml_backend_sycl_host_buffer_type_name,
             /* .alloc_buffer     = */ ggml_backend_sycl_host_buffer_type_alloc_buffer,
             /* .get_alignment    = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
             /* .get_alloc_size   = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
@@ -11990,7 +12030,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }
 
-static ggml_backend_graph_plan_t ggml_backend_sycl_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+static ggml_backend_graph_plan_t ggml_backend_sycl_graph_plan_create(ggml_backend_t backend, const ggml_cgraph * cgraph) {
     GGML_ASSERT(!"not implemented");
 
     return nullptr;
@@ -12180,14 +12220,13 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
     UNUSED(backend);
 }
 
-static ggml_backend_i sycl_backend_i = {
+static ggml_backend_i ggml_backend_sycl_interface = {
     /* .get_name                = */ ggml_backend_sycl_name,
     /* .free                    = */ ggml_backend_sycl_free,
     /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
     /* .set_tensor_async        = */ ggml_backend_sycl_set_tensor_async,
     /* .get_tensor_async        = */ ggml_backend_sycl_get_tensor_async,
-    /* .cpy_tensor_from_async   = */ NULL,
-    /* .cpy_tensor_to_async     = */ NULL,
+    /* .cpy_tensor_async        = */ NULL,
     /* .synchronize             = */ ggml_backend_sycl_synchronize,
     /* .graph_plan_create       = */ ggml_backend_sycl_graph_plan_create,
     /* .graph_plan_free         = */ ggml_backend_sycl_graph_plan_free,
@@ -12212,7 +12251,7 @@ ggml_backend_t ggml_backend_sycl_init(int device) {
     };
 
     ggml_backend_t sycl_backend = new ggml_backend {
-        /* .interface = */ sycl_backend_i,
+        /* .interface = */ ggml_backend_sycl_interface,
         /* .context   = */ ctx
     };
 
diff --git a/ggml-sycl.h b/ggml-sycl.h
index e4379a9876845..3dd2bc20533ea 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -2386,7 +2386,6 @@ namespace dpct
                      const void *beta, void *c, library_data_t c_type, int ldc,
                      library_data_t scaling_type)
     {
-        bool matched = false;
         if (scaling_type == library_data_t::real_float &&
             c_type == library_data_t::complex_float)
         {
@@ -2542,7 +2541,6 @@ namespace dpct
 #ifdef DPCT_USM_LEVEL_NONE
         throw std::runtime_error("this API is unsupported when USM level is none");
 #else
-        bool matched = false;
         if (scaling_type == library_data_t::real_float &&
             c_type == library_data_t::complex_float)
         {
@@ -2712,7 +2710,6 @@ namespace dpct
                            int ldc, long long int stride_c, int batch_size,
                            library_data_t scaling_type)
     {
-        bool matched = false;
         if (scaling_type == library_data_t::real_float &&
             c_type == library_data_t::complex_float)
         {
diff --git a/llama.cpp b/llama.cpp
index eb9426f444a2c..2dfff45c39c97 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6703,7 +6703,7 @@ static int llama_decode_internal(
     }
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if ((ggml_cpu_has_cublas() || ggml_cpu_has_sycl()) && fully_offloaded) {
+    if (ggml_cpu_has_cublas() && fully_offloaded) {
         n_threads = 1;
     }
 
@@ -9939,13 +9939,14 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_SYCL)
         if (model->n_gpu_layers > 0) {
-            ctx->backend = ggml_backend_sycl_init(0);
-            if (ctx->backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize SYCL backend\n", __func__);
-            }
+                ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
         }
-
-
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
         if (ctx->backend_cpu == nullptr) {

From 533c647d0ea3e73800e820fc5950b749e924cae5 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Tue, 23 Jan 2024 13:34:05 +0800
Subject: [PATCH 40/90] check for sycl blas, better performance

---
 llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index 2dfff45c39c97..d3c9eaa79a9ed 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6703,7 +6703,7 @@ static int llama_decode_internal(
     }
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if (ggml_cpu_has_cublas() && fully_offloaded) {
+    if ((ggml_cpu_has_cublas() || ggml_cpu_has_sycl()) && fully_offloaded) {
         n_threads = 1;
     }
 

From dd7f1396f92362de27ab6833e596ced28bda1ef4 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Mon, 22 Jan 2024 21:37:16 -0800
Subject: [PATCH 41/90] cleanup 1

---
 common/common.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index abe752352ba64..faafc6ce55e7d 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -659,7 +659,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             params.mul_mat_q = false;
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS or SYCL. Disabling mul_mat_q kernels has no effect.\n");
-#endif // GGML_USE_CUBLAS
+#endif // GGML_USE_CUBLAS || GGML_USE_SYCL
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--numa") {
@@ -1021,6 +1021,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n");
     printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
     printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
+#endif // LLAMA_SUPPORTS_GPU_OFFLOAD
 #ifdef GGML_USE_CLBLAS
     printf("  -nommq, --no-mul-mat-q\n");
     printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");

From b40378422889842641f64f0a26cd6999fed1b6b8 Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Tue, 23 Jan 2024 06:09:19 +0000
Subject: [PATCH 42/90] remove extra endif

---
 common/common.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index faafc6ce55e7d..0091437e7fa49 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1032,7 +1032,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("                        use " GGML_SYCL_NAME " instead of custom mul_mat_q " GGML_SYCL_NAME " kernels.\n");
     printf("                        Not recommended since this is both slower and uses more VRAM.\n");
 #endif // GGML_USE_SYCL
-#endif
     printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf("  -gan N, --grp-attn-n N\n");

From a0a1304b0c85714606e38ffdef1fda63ef1e24e0 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Tue, 23 Jan 2024 14:16:01 +0800
Subject: [PATCH 43/90] add build&run script, clean CMakefile, update guide by
 review comments

---
 CMakeLists.txt              | 10 ++++++----
 README_sycl.md              | 31 ++++++++++++++++++++++++-------
 examples/sycl/build.sh      | 15 +++++++++++++++
 examples/sycl/run_llama2.sh | 14 ++++++++++++++
 4 files changed, 59 insertions(+), 11 deletions(-)
 create mode 100755 examples/sycl/build.sh
 create mode 100755 examples/sycl/run_llama2.sh

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6fe3488d9d443..6d2168250b38b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -101,7 +101,6 @@ option(LLAMA_CLBLAST                         "llama: use CLBlast"
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
 option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
-option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
 option(LLAMA_MPI                             "llama: use MPI"                                   OFF)
 option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
 option(LLAMA_SYCL                            "llama: use SYCL"                                  OFF)
@@ -124,8 +123,12 @@ include(${CMAKE_CURRENT_SOURCE_DIR}/scripts/build-info.cmake)
 #
 # Compile flags
 #
+if (LLAMA_SYCL)
+    set(CMAKE_CXX_STANDARD 17)
+else()
+    set(CMAKE_CXX_STANDARD 11)
+endif()
 
-set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_STANDARD_REQUIRED true)
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_C_STANDARD_REQUIRED true)
@@ -452,7 +455,6 @@ if (LLAMA_SYCL)
     if ( NOT DEFINED ENV{ONEAPI_ROOT})
 	message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
     endif()
-    
     #todo: AOT
 
     find_package(IntelSYCL REQUIRED)
@@ -464,7 +466,7 @@ if (LLAMA_SYCL)
     add_compile_options(-I./) #include DPCT 
     add_compile_options(-I/${SYCL_INCLUDE_DIR})
 
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17 -Wno-narrowing")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
 
diff --git a/README_sycl.md b/README_sycl.md
index 993155071e191..c81fc70294fef 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -120,8 +120,9 @@ cd build
 source /opt/intel/oneapi/setvars.sh
 
 #for FP16
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
 
+#for FP32
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 
 #build example/main only
@@ -135,9 +136,13 @@ cmake --build . --config Release -v
 or
 
 ```
-./sycl_build.sh
+./examples/sycl/build.sh
 ```
 
+Note:
+
+- By default, it will build for all binary files. It will take more time. To reduce the time, we recommend to build for **example/main** only.
+
 ### Run
 
 1. Put model file to folder **models**
@@ -190,9 +195,14 @@ GGML_SYCL_DEVICE=0 && ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Buildi
 or run by script:
 
 ```
-./examples/sycl_run_llama2.sh
+./examples/sycl/run_llama2.sh
 ```
 
+Note:
+
+- By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue.
+
+
 5. Check the device ID in output
 
 Like：
@@ -207,11 +217,10 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 
 |Name|Value|Function|
 |-|-|-|
-|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path|
+|LLAMA_SYCL|ON (mandatory)|Enable build with SYCL code path. <br>For FP32/FP16, LLAMA_SYCL=ON is mandatory.|
+|LLAMA_SYCL_F16|ON (optional)|Enable FP16 build with SYCL code path. Faster for long-prompt inference. <br>For FP32, not set it.|
 |CMAKE_C_COMPILER|icx|Use icx compiler for SYCL code path|
 |CMAKE_CXX_COMPILER|icpx|use icpx for SYCL code path|
-|GGML_SYCL_F16|OFF (default) or ON|Enable FP16 in computing|
-
 
 #### Running
 
@@ -223,9 +232,17 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 
 ## Known Issue
 
--  Hang during startup
+- Error:  `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
+
+  Miss to enable oneAPI running environment.
+
+  Install oneAPI base toolkit and enable it by: `source /opt/intel/oneapi/setvars.sh`.
+
+
+- Hang during startup
 
   llama.cpp use mmap as default way to read model file and copy to GPU. In some system, memcpy will be abnormal and block.
+
   Solution: add **--no-mmap**.
 
 ## Todo
diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh
new file mode 100755
index 0000000000000..5556f3c6c1edb
--- /dev/null
+++ b/examples/sycl/build.sh
@@ -0,0 +1,15 @@
+mkdir -p build
+cd build
+source /opt/intel/oneapi/setvars.sh
+
+#for FP16
+#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON # faster for long-prompt inference
+
+#for FP32
+cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
+#build example/main only
+#cmake --build . --config Release --target main
+
+#build all binary
+cmake --build . --config Release -v
diff --git a/examples/sycl/run_llama2.sh b/examples/sycl/run_llama2.sh
new file mode 100755
index 0000000000000..045e01a9435b8
--- /dev/null
+++ b/examples/sycl/run_llama2.sh
@@ -0,0 +1,14 @@
+
+INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
+source /opt/intel/oneapi/setvars.sh
+
+if [ $# -gt 0 ]; then
+  export GGML_SYCL_DEVICE=$1
+else
+  export GGML_SYCL_DEVICE=0
+fi
+echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
+#export GGML_SYCL_DEBUG=1
+./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
+#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
+

From 97cbe18dd280e69b7bbd108c42e02803e3d90b62 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Tue, 23 Jan 2024 14:35:33 +0800
Subject: [PATCH 44/90] rename macro to intel hardware

---
 ggml-sycl.cpp | 110 +++++++++++++++++++++++++-------------------------
 1 file changed, 55 insertions(+), 55 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 4995115551c92..0f2edf27fac42 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -45,10 +45,10 @@ static int g_work_group_size = 0;
 // typedef sycl::half ggml_fp16_t;
 
 #define __SYCL_ARCH__ DPCT_COMPATIBILITY_TEMP
-#define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
-#define CC_VOLTA      700
-#define CC_OFFSET_AMD 1000000
-#define CC_RDNA2      (CC_OFFSET_AMD + 1030)
+#define VER_4VEC   610          //todo for hardward optimize.
+#define VER_GEN9      700       //todo for hardward optimize.
+#define VER_GEN12 1000000       //todo for hardward optimize.
+#define VER_GEN13      (VER_GEN12 + 1030)   //todo for hardward optimize.
 
 #define GGML_SYCL_MAX_NODES 8192 //TODO: adapt to hardwares
 
@@ -3525,7 +3525,7 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
 
 #else
 
-#if __SYCL_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
 
     float sumf_d = 0.0f;
@@ -3565,7 +3565,7 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
 
 #else
     bad_arch();
-#endif // __SYCL_ARCH__ >= MIN_CC_DP4A
+#endif // __SYCL_ARCH__ >= VER_4VEC
 
 #endif
 }
@@ -3718,7 +3718,7 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
 
 #else
 
-#if __SYCL_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
+#if __SYCL_ARCH__ >= VER_4VEC // lowest compute capability for integer intrinsics
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;
 
     const int8_t * s = bq5_K->scales;
@@ -3754,7 +3754,7 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
 
 #else
     bad_arch();
-#endif // __SYCL_ARCH__ >= MIN_CC_DP4A
+#endif // __SYCL_ARCH__ >= VER_4VEC
 
 #endif
 }
@@ -6501,19 +6501,19 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy,
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
+    if (compute_capability >= VER_GEN13) {
         mmq_x  =  MMQ_X_Q4_0_RDNA2;
         mmq_y  =  MMQ_Y_Q4_0_RDNA2;
         nwarps = NWARPS_Q4_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
+    } else if (compute_capability >= VER_GEN12) {
         mmq_x  =  MMQ_X_Q4_0_RDNA1;
         mmq_y  =  MMQ_Y_Q4_0_RDNA1;
         nwarps = NWARPS_Q4_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
+    } else if (compute_capability >= VER_GEN9) {
         mmq_x  =  MMQ_X_Q4_0_AMPERE;
         mmq_y  =  MMQ_Y_Q4_0_AMPERE;
         nwarps = NWARPS_Q4_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
+    } else if (compute_capability >= VER_4VEC) {
         mmq_x  =  MMQ_X_Q4_0_PASCAL;
         mmq_y  =  MMQ_Y_Q4_0_PASCAL;
         nwarps = NWARPS_Q4_0_PASCAL;
@@ -6616,19 +6616,19 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy,
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
+    if (compute_capability >= VER_GEN13) {
         mmq_x  =  MMQ_X_Q4_1_RDNA2;
         mmq_y  =  MMQ_Y_Q4_1_RDNA2;
         nwarps = NWARPS_Q4_1_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
+    } else if (compute_capability >= VER_GEN12) {
         mmq_x  =  MMQ_X_Q4_1_RDNA1;
         mmq_y  =  MMQ_Y_Q4_1_RDNA1;
         nwarps = NWARPS_Q4_1_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
+    } else if (compute_capability >= VER_GEN9) {
         mmq_x  =  MMQ_X_Q4_1_AMPERE;
         mmq_y  =  MMQ_Y_Q4_1_AMPERE;
         nwarps = NWARPS_Q4_1_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
+    } else if (compute_capability >= VER_4VEC) {
         mmq_x  =  MMQ_X_Q4_1_PASCAL;
         mmq_y  =  MMQ_Y_Q4_1_PASCAL;
         nwarps = NWARPS_Q4_1_PASCAL;
@@ -6731,19 +6731,19 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy,
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
+    if (compute_capability >= VER_GEN13) {
         mmq_x  =  MMQ_X_Q5_0_RDNA2;
         mmq_y  =  MMQ_Y_Q5_0_RDNA2;
         nwarps = NWARPS_Q5_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
+    } else if (compute_capability >= VER_GEN12) {
         mmq_x  =  MMQ_X_Q5_0_RDNA1;
         mmq_y  =  MMQ_Y_Q5_0_RDNA1;
         nwarps = NWARPS_Q5_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
+    } else if (compute_capability >= VER_GEN9) {
         mmq_x  =  MMQ_X_Q5_0_AMPERE;
         mmq_y  =  MMQ_Y_Q5_0_AMPERE;
         nwarps = NWARPS_Q5_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
+    } else if (compute_capability >= VER_4VEC) {
         mmq_x  =  MMQ_X_Q5_0_PASCAL;
         mmq_y  =  MMQ_Y_Q5_0_PASCAL;
         nwarps = NWARPS_Q5_0_PASCAL;
@@ -6846,19 +6846,19 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy,
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
+    if (compute_capability >= VER_GEN13) {
         mmq_x  =  MMQ_X_Q5_1_RDNA2;
         mmq_y  =  MMQ_Y_Q5_1_RDNA2;
         nwarps = NWARPS_Q5_1_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
+    } else if (compute_capability >= VER_GEN12) {
         mmq_x  =  MMQ_X_Q5_1_RDNA1;
         mmq_y  =  MMQ_Y_Q5_1_RDNA1;
         nwarps = NWARPS_Q5_1_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
+    } else if (compute_capability >= VER_GEN9) {
         mmq_x  =  MMQ_X_Q5_1_AMPERE;
         mmq_y  =  MMQ_Y_Q5_1_AMPERE;
         nwarps = NWARPS_Q5_1_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
+    } else if (compute_capability >= VER_4VEC) {
         mmq_x  =  MMQ_X_Q5_1_PASCAL;
         mmq_y  =  MMQ_Y_Q5_1_PASCAL;
         nwarps = NWARPS_Q5_1_PASCAL;
@@ -6961,19 +6961,19 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy,
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
+    if (compute_capability >= VER_GEN13) {
         mmq_x  =  MMQ_X_Q8_0_RDNA2;
         mmq_y  =  MMQ_Y_Q8_0_RDNA2;
         nwarps = NWARPS_Q8_0_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
+    } else if (compute_capability >= VER_GEN12) {
         mmq_x  =  MMQ_X_Q8_0_RDNA1;
         mmq_y  =  MMQ_Y_Q8_0_RDNA1;
         nwarps = NWARPS_Q8_0_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
+    } else if (compute_capability >= VER_GEN9) {
         mmq_x  =  MMQ_X_Q8_0_AMPERE;
         mmq_y  =  MMQ_Y_Q8_0_AMPERE;
         nwarps = NWARPS_Q8_0_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
+    } else if (compute_capability >= VER_4VEC) {
         mmq_x  =  MMQ_X_Q8_0_PASCAL;
         mmq_y  =  MMQ_Y_Q8_0_PASCAL;
         nwarps = NWARPS_Q8_0_PASCAL;
@@ -7076,19 +7076,19 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy,
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
+    if (compute_capability >= VER_GEN13) {
         mmq_x  =  MMQ_X_Q2_K_RDNA2;
         mmq_y  =  MMQ_Y_Q2_K_RDNA2;
         nwarps = NWARPS_Q2_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
+    } else if (compute_capability >= VER_GEN12) {
         mmq_x  =  MMQ_X_Q2_K_RDNA1;
         mmq_y  =  MMQ_Y_Q2_K_RDNA1;
         nwarps = NWARPS_Q2_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
+    } else if (compute_capability >= VER_GEN9) {
         mmq_x  =  MMQ_X_Q2_K_AMPERE;
         mmq_y  =  MMQ_Y_Q2_K_AMPERE;
         nwarps = NWARPS_Q2_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
+    } else if (compute_capability >= VER_4VEC) {
         mmq_x  =  MMQ_X_Q2_K_PASCAL;
         mmq_y  =  MMQ_Y_Q2_K_PASCAL;
         nwarps = NWARPS_Q2_K_PASCAL;
@@ -7199,19 +7199,19 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy,
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
+    if (compute_capability >= VER_GEN13) {
         mmq_x  =  MMQ_X_Q3_K_RDNA2;
         mmq_y  =  MMQ_Y_Q3_K_RDNA2;
         nwarps = NWARPS_Q3_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
+    } else if (compute_capability >= VER_GEN12) {
         mmq_x  =  MMQ_X_Q3_K_RDNA1;
         mmq_y  =  MMQ_Y_Q3_K_RDNA1;
         nwarps = NWARPS_Q3_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
+    } else if (compute_capability >= VER_GEN9) {
         mmq_x  =  MMQ_X_Q3_K_AMPERE;
         mmq_y  =  MMQ_Y_Q3_K_AMPERE;
         nwarps = NWARPS_Q3_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
+    } else if (compute_capability >= VER_4VEC) {
         mmq_x  =  MMQ_X_Q3_K_PASCAL;
         mmq_y  =  MMQ_Y_Q3_K_PASCAL;
         nwarps = NWARPS_Q3_K_PASCAL;
@@ -7327,19 +7327,19 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy,
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
+    if (compute_capability >= VER_GEN13) {
         mmq_x  =  MMQ_X_Q4_K_RDNA2;
         mmq_y  =  MMQ_Y_Q4_K_RDNA2;
         nwarps = NWARPS_Q4_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
+    } else if (compute_capability >= VER_GEN12) {
         mmq_x  =  MMQ_X_Q4_K_RDNA1;
         mmq_y  =  MMQ_Y_Q4_K_RDNA1;
         nwarps = NWARPS_Q4_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
+    } else if (compute_capability >= VER_GEN9) {
         mmq_x  =  MMQ_X_Q4_K_AMPERE;
         mmq_y  =  MMQ_Y_Q4_K_AMPERE;
         nwarps = NWARPS_Q4_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
+    } else if (compute_capability >= VER_4VEC) {
         mmq_x  =  MMQ_X_Q4_K_PASCAL;
         mmq_y  =  MMQ_Y_Q4_K_PASCAL;
         nwarps = NWARPS_Q4_K_PASCAL;
@@ -7448,19 +7448,19 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy,
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
+    if (compute_capability >= VER_GEN13) {
         mmq_x  =  MMQ_X_Q5_K_RDNA2;
         mmq_y  =  MMQ_Y_Q5_K_RDNA2;
         nwarps = NWARPS_Q5_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
+    } else if (compute_capability >= VER_GEN12) {
         mmq_x  =  MMQ_X_Q5_K_RDNA1;
         mmq_y  =  MMQ_Y_Q5_K_RDNA1;
         nwarps = NWARPS_Q5_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
+    } else if (compute_capability >= VER_GEN9) {
         mmq_x  =  MMQ_X_Q5_K_AMPERE;
         mmq_y  =  MMQ_Y_Q5_K_AMPERE;
         nwarps = NWARPS_Q5_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
+    } else if (compute_capability >= VER_4VEC) {
         mmq_x  =  MMQ_X_Q5_K_PASCAL;
         mmq_y  =  MMQ_Y_Q5_K_PASCAL;
         nwarps = NWARPS_Q5_K_PASCAL;
@@ -7569,19 +7569,19 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy,
     const int compute_capability = g_device_caps[id].cc;
 
     int mmq_x, mmq_y, nwarps;
-    if (compute_capability >= CC_RDNA2) {
+    if (compute_capability >= VER_GEN13) {
         mmq_x  =  MMQ_X_Q6_K_RDNA2;
         mmq_y  =  MMQ_Y_Q6_K_RDNA2;
         nwarps = NWARPS_Q6_K_RDNA2;
-    } else if (compute_capability >= CC_OFFSET_AMD) {
+    } else if (compute_capability >= VER_GEN12) {
         mmq_x  =  MMQ_X_Q6_K_RDNA1;
         mmq_y  =  MMQ_Y_Q6_K_RDNA1;
         nwarps = NWARPS_Q6_K_RDNA1;
-    } else if (compute_capability >= CC_VOLTA) {
+    } else if (compute_capability >= VER_GEN9) {
         mmq_x  =  MMQ_X_Q6_K_AMPERE;
         mmq_y  =  MMQ_Y_Q6_K_AMPERE;
         nwarps = NWARPS_Q6_K_AMPERE;
-    } else if (compute_capability >= MIN_CC_DP4A) {
+    } else if (compute_capability >= VER_4VEC) {
         mmq_x  =  MMQ_X_Q6_K_PASCAL;
         mmq_y  =  MMQ_Y_Q6_K_PASCAL;
         nwarps = NWARPS_Q6_K_PASCAL;
@@ -9118,7 +9118,7 @@ static int64_t get_row_rounding(ggml_type type) {
     switch(type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
+            return max_compute_capability >= VER_GEN9 ? 128 : 64;
         case GGML_TYPE_Q5_0:
         case GGML_TYPE_Q5_1:
         case GGML_TYPE_Q8_0:
@@ -9130,7 +9130,7 @@ static int64_t get_row_rounding(ggml_type type) {
         case GGML_TYPE_Q3_K:
         case GGML_TYPE_Q4_K:
         case GGML_TYPE_Q5_K:
-            return max_compute_capability >= CC_VOLTA ? 128 : 64;
+            return max_compute_capability >= VER_GEN9 ? 128 : 64;
         case GGML_TYPE_Q6_K:
             return 64;
         default:
@@ -9302,7 +9302,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
 #else
     bool use_fp16 = false;
 #endif
-    // if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 ||
+    // if (compute_capability >= VER_GEN9 && (src0->type == GGML_TYPE_F16 ||
     // ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff ==
     // src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) {
     if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
@@ -10610,7 +10610,7 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
 #ifdef GGML_SYCL_FORCE_DMMV
             const bool use_mul_mat_vec_q = false;
 #else
-            const bool use_mul_mat_vec_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
+            const bool use_mul_mat_vec_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type) && ggml_nrows(src1) == 1;
 #endif // GGML_SYCL_FORCE_DMMV
 
             if (use_mul_mat_vec_q) {
@@ -10622,9 +10622,9 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
                 ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false);
             }
         } else {
-            bool use_mul_mat_q = min_compute_capability >= MIN_CC_DP4A && ggml_is_quantized(src0->type);
+            bool use_mul_mat_q = min_compute_capability >= VER_4VEC && ggml_is_quantized(src0->type);
 
-            if (use_xmx && min_compute_capability >= CC_VOLTA && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
+            if (use_xmx && min_compute_capability >= VER_GEN9 && src1->ne[1] > XMX_MAX_BATCH_SIZE) {
                 use_mul_mat_q = false;
             }
 
@@ -11880,7 +11880,7 @@ static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_t
 }
 
 static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
-    /* .get_name         = */ ggml_backend_sycl_buffer_type_name,    
+    /* .get_name         = */ ggml_backend_sycl_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_sycl_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_sycl_buffer_type_get_alignment,
     /* .get_alloc_size   = */ ggml_backend_sycl_buffer_type_get_alloc_size,

From 1ddaf44c30f607822414dcc6afa0f85f28362cbb Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 23 Jan 2024 01:03:34 -0800
Subject: [PATCH 45/90] editor config format

---
 CMakeLists.txt               | 4 ++--
 examples/server/server.cpp   | 2 +-
 examples/sycl/CMakeLists.txt | 2 +-
 ggml-sycl.h                  | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6d2168250b38b..207a8e4dc14bd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -453,7 +453,7 @@ endif()
 
 if (LLAMA_SYCL)
     if ( NOT DEFINED ENV{ONEAPI_ROOT})
-	message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
+    message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
     endif()
     #todo: AOT
 
@@ -463,7 +463,7 @@ if (LLAMA_SYCL)
     endif()
     add_compile_definitions(GGML_USE_SYCL)
 
-    add_compile_options(-I./) #include DPCT 
+    add_compile_options(-I./) #include DPCT
     add_compile_options(-I/${SYCL_INCLUDE_DIR})
 
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index bebc211d43ea8..c97ad70527ba3 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -2319,7 +2319,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
                 invalid_param = true;
                 break;
             }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL) 
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
             std::string arg_next = argv[i];
 
             // split string by , and /
diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index 8404704792520..ad759ae80fea5 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -2,4 +2,4 @@ set(TARGET ls-sycl-device)
 add_executable(${TARGET} ls-sycl-device.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
\ No newline at end of file
+target_compile_features(${TARGET} PRIVATE cxx_std_17)
diff --git a/ggml-sycl.h b/ggml-sycl.h
index 3dd2bc20533ea..f86703456fde6 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -2858,4 +2858,4 @@ namespace dpct
 
 } // COPY from DPCT head files
 
-#endif
\ No newline at end of file
+#endif

From bd716b25940651832854068231b6dcdcfbc30665 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Tue, 23 Jan 2024 14:45:36 +0530
Subject: [PATCH 46/90] format fixes

---
 README_sycl.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README_sycl.md b/README_sycl.md
index c81fc70294fef..f057f06ef1b27 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -170,13 +170,13 @@ Check the ID in startup log, like:
 ```
 found 4 SYCL devices:
   Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
-	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
   Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
-	max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
+    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
   Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
-	max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
+    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
   Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
-	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
 
 ```
 

From be31379ef835382a8c70548f35b5080be6727557 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Tue, 23 Jan 2024 14:47:30 +0530
Subject: [PATCH 47/90] format fixes


From d097e2a4ef832147595e42eed0085ef9a948b7dc Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Tue, 23 Jan 2024 17:32:42 +0530
Subject: [PATCH 48/90] editor format fix


From 88f64b7d3db93c73110d99b47eb40727eb52446d Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Tue, 23 Jan 2024 17:34:57 +0530
Subject: [PATCH 49/90] Remove unused headers

---
 examples/sycl/ls-sycl-device.cpp | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp
index 619bee9de33ec..f1178396a4c89 100644
--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@@ -1,19 +1,3 @@
-/*
- * #include "common.h"
-
-#include "console.h"
-#include <cassert>
-#include <cinttypes>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <vector>
-*/
 
 #include "ggml-sycl.h"
 

From 756c4accaf6dbf41a61caf80bfdb392efbc60f7b Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Tue, 23 Jan 2024 20:06:08 +0800
Subject: [PATCH 50/90] skip build sycl tool for other code path

---
 examples/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 6b1d24e42d169..68ad899648137 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -23,7 +23,9 @@ else()
     add_subdirectory(infill)
     add_subdirectory(llama-bench)
     add_subdirectory(llava)
-    add_subdirectory(sycl)
+    if (LLAMA_SYCL)
+        add_subdirectory(sycl)
+    endif()
     add_subdirectory(main)
     add_subdirectory(tokenize)
     add_subdirectory(parallel)

From b42a32d31a1270d29a333a0bab3f8065122feb97 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Tue, 23 Jan 2024 20:20:16 +0800
Subject: [PATCH 51/90] replace tab by space

---
 examples/sycl/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/sycl/README.md b/examples/sycl/README.md
index 6b38bb8a8420a..b46f17f39efd4 100644
--- a/examples/sycl/README.md
+++ b/examples/sycl/README.md
@@ -31,13 +31,13 @@ Check the ID in startup log, like:
 ```
 found 4 SYCL devices:
   Device 0: Intel(R) Arc(TM) A770 Graphics,	compute capability 1.3,
-	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
   Device 1: Intel(R) FPGA Emulation Device,	compute capability 1.2,
-	max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
+    max compute_units 24,	max work group size 67108864,	max sub group size 64,	global mem size 67065057280
   Device 2: 13th Gen Intel(R) Core(TM) i7-13700K,	compute capability 3.0,
-	max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
+    max compute_units 24,	max work group size 8192,	max sub group size 64,	global mem size 67065057280
   Device 3: Intel(R) Arc(TM) A770 Graphics,	compute capability 3.0,
-	max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
+    max compute_units 512,	max work group size 1024,	max sub group size 32,	global mem size 16225243136
 
 ```
 

From 5f83a12382dc4829e849f0cfcb8c7fab1a8d62dd Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Tue, 23 Jan 2024 17:56:37 +0530
Subject: [PATCH 52/90] fix blas matmul function

---
 ggml.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml.c b/ggml.c
index 67c791b27dad6..1a261fb4b2265 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9958,7 +9958,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         const int64_t ne_plane      = ne01*ne00;
         const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
         UNUSED(desired_wsize);

From d6fc1a0309480242681db5d41cb0ce72ae8cfc66 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 23 Jan 2024 07:19:30 -0800
Subject: [PATCH 53/90] fix mac build

---
 ggml.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ggml.c b/ggml.c
index 1a261fb4b2265..8368ada713f8f 100644
--- a/ggml.c
+++ b/ggml.c
@@ -9958,7 +9958,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+    if (ggml_compute_forward_mul_mat_use_blas(dst)) {
         const int64_t ne_plane      = ne01*ne00;
         const int64_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
         UNUSED(desired_wsize);
@@ -16587,7 +16587,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 }
 #endif
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node->src[0], node->src[1], node)) {
+                if (ggml_compute_forward_mul_mat_use_blas(node)) {
                     n_tasks = 1; // TODO: this actually is doing nothing
                                  //       the threads are still spinning
                 }

From c7e745e6f36cca5a8bd1f1caf792288cd1bee9d9 Mon Sep 17 00:00:00 2001
From: abhilash1910 <abhilash.majumder@intel.com>
Date: Tue, 23 Jan 2024 07:35:31 -0800
Subject: [PATCH 54/90] restore hip dependency

---
 CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 207a8e4dc14bd..f70a3904f8aa7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,6 +97,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                              "llama: max. batch size for using peer access")
 option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
+option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
 option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)

From 3bfb846d6a91917b8ac334c48fcb6ba0cf8be81e Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Tue, 23 Jan 2024 23:45:56 +0800
Subject: [PATCH 55/90] fix conflict

---
 ggml.c        | 24 ------------------------
 ggml.h        |  2 +-
 llama.cpp     | 22 +++++++---------------
 sycl_build.sh | 14 --------------
 4 files changed, 8 insertions(+), 54 deletions(-)
 delete mode 100755 sycl_build.sh

diff --git a/ggml.c b/ggml.c
index 8368ada713f8f..48f0eb7d081b8 100644
--- a/ggml.c
+++ b/ggml.c
@@ -14696,8 +14696,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     if (skip_cpu) {
         return;
     }
-    GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
-    GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
 #endif // GGML_USE_SYCL
     switch (tensor->op) {
         case GGML_OP_DUP:
@@ -16570,28 +16568,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
 
                 //n_tasks = MIN(n_threads, MAX(1, nr0/128));
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
-
-#if defined(GGML_USE_CUBLAS)
-                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#elif defined(GGML_USE_CLBLAST)
-                if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#elif defined(GGML_USE_SYCL)
-                if (ggml_sycl_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1;
-                }
-#endif
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-                if (ggml_compute_forward_mul_mat_use_blas(node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
-                }
-#endif
             } break;
         case GGML_OP_MUL_MAT_ID:
             {
diff --git a/ggml.h b/ggml.h
index 5a173d362c79f..a06e27c629303 100644
--- a/ggml.h
+++ b/ggml.h
@@ -2284,7 +2284,7 @@ extern "C" {
     typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void  * GGML_RESTRICT y, int k);
     typedef void (*ggml_vec_dot_t)   (const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT x, const void * GGML_RESTRICT y);
 
-    typedef struct dpct_type_994041 {
+    typedef struct {
         const char      * type_name;
         int               blck_size;
         size_t            type_size;
diff --git a/llama.cpp b/llama.cpp
index d3c9eaa79a9ed..ce74fb557aff1 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9791,14 +9791,6 @@ struct llama_model * llama_load_model_from_file(
               struct llama_model_params   params) {
     ggml_time_init();
 
-#ifdef GGML_USE_SYCL
-    int main_device = get_main_device();
-    if(main_device>=0) params.main_gpu = main_device;
-    else {
-        LLAMA_LOG_ERROR("%s: missed to init GPU device\n", __func__);
-        std::exit(1);
-    }
-#endif
     llama_model * model = new llama_model;
 
     unsigned cur_percentage = 0;
@@ -9939,13 +9931,13 @@ struct llama_context * llama_new_context_with_model(
         }
 #elif defined(GGML_USE_SYCL)
         if (model->n_gpu_layers > 0) {
-                ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
+            ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu);
+            if (backend == nullptr) {
+                LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu);
+                llama_free(ctx);
+                return nullptr;
+            }
+            ctx->backends.push_back(backend);
         }
 #endif
         ctx->backend_cpu = ggml_backend_cpu_init();
diff --git a/sycl_build.sh b/sycl_build.sh
deleted file mode 100755
index 8f2e1d22da08d..0000000000000
--- a/sycl_build.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-mkdir -p build
-cd build
-source /opt/intel/oneapi/setvars.sh
-
-#for FP16
-#cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
-
-cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
-
-#build example/main only
-#cmake --build . --config Release --target main
-
-#build all binary
-cmake --build . --config Release -v

From 498121b11f1899f938a81bc8dccb86df8cd80953 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 24 Jan 2024 09:20:42 +0800
Subject: [PATCH 56/90] ren as review comments

---
 .../sycl/{run_llama2.sh => run-llama2.sh}     |  0
 examples/sycl_run_llama2.sh                   | 19 -------------------
 2 files changed, 19 deletions(-)
 rename examples/sycl/{run_llama2.sh => run-llama2.sh} (100%)
 delete mode 100755 examples/sycl_run_llama2.sh

diff --git a/examples/sycl/run_llama2.sh b/examples/sycl/run-llama2.sh
similarity index 100%
rename from examples/sycl/run_llama2.sh
rename to examples/sycl/run-llama2.sh
diff --git a/examples/sycl_run_llama2.sh b/examples/sycl_run_llama2.sh
deleted file mode 100755
index 6e08fac7d9a4b..0000000000000
--- a/examples/sycl_run_llama2.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-INPUT1="The process of Origami seems simple at the first glance, but in fact, it still requires a very complicated process to do it well. Taking folding a rose as an example, we can divide the entire process into three stages, including: firstly creating a grid of creases, secondly making a three-dimensional base, and thirdly finishing petal decoration. The first step is to create a grid of creases: this step is a bit like the first step of folding a gift of thousand-paper-crane. That is to say, we can fold the paper in half (or namedly equal-folds) through the symmetrical axis, and repeat such step in the other symmetrical axis. And then apply multiple equal-folds in sequence relative to each smaller rectangle divided by the two creases; After that, the creases in each direction will interweave into a complete set of uniform small square splicing patterns; these small squares form a reference space similar to a two-dimensional coordinate system, allowing us to combine adjacent creases on the plane from Three-dimensional high platforms or depressions are folded on the two-dimensional small squares to facilitate the next steps of folding. It should be noted that, in the process of creating grid creases, there may be rare cases when the folds are not aligned. The consequences of this error can be very serious. And just like the butterfly effect, it is only a slight difference at the beginning , and in the end it may generate a disaster world which is completely different from plan. Anyway, let's continue. The second step is make the three-dimensional base: In this step, we need to fold a set of symmetrical three-dimensional high platforms or depressions based on the grid creases. From the symmetry analysis, it is not difficult to find that the rose will have four symmetrical three-dimensional high platforms and supporting depressions. Therefore, we can firstly fold out a quarter of the depression and plateau patterns, which would help build a base to compose into a complex 3D structure. And then, we use this quarter as a template, and fold out the repeating patterns on the remaining three parts of the whole structure in turn. It is worth noting that the layout of the high platform not only needs to consider the regular contrast and symmetrical distribution of the length and width, but also needs to ensure the orderliness of the height dimension. This is very important, since we will never go back to this step after all parts were made, and you would better start from first step if you make anything wrong in the this step. Similar to the precautions in the first stage, please handle all the corners in three dimensions to ensure that they conform to the layout required in the plan, which would help us avoid the butterfly effect and increase the robustness in the process of three-dimensional folding. Just like building a skyscrapper in the real world, people usually take a lot of time when building the base but soon get finished when extending the structure after that. Time is worth to cost in the base, but would be saved in the future after you succeed in base. Anyway, let's continue. During the first quarter of the pattern, repeated comparisons with the finished rose were made to eliminate any possible errors in the first place. The final stage is to finish the petal grooming. At this stage, we often emphasize an important term called folding-by-heart. The intention here is no longer literally serious, but focus is moved to our understanding of the shape of a rose in nature, and we usually use natural curves to continuously correct the shape of petals in order to approach the shape of rose petals in reality. One more comment: this is also the cause of randomness to the art, which can be generated differently by different people. Recall that rose should be adjusted close to reality, so in the last step of this stage, we need to open the bloom in the center of the rose, by pulling on the four petals that have been bent. This process may be accompanied by the collapse of the overall structure of the rose, so we should be very careful to save strength of adjustment, and it must be well controlled to avoid irreversible consequences. Ultimately, after three stages of folding, we end up with a crown of rose with a similar shape close to reality. If condition is permited, we can wrap a green paper strip twisted on a straightened iron wire, and insert the rose crown we just created onto one side of the iron wire. In this way, we got a hand-made rose with a green stem. We can also repeat the steps above to increase the number of rose, so that it can be made into a cluster. Different color of rose is usually more attractive and can be considered as a better plan of gift to your friend. In summary, by creating a grid of creases, making a three-dimensional base, and finishing with petals, we created a three-dimensional rose from a two-dimensional paper. Although this process may seem simple, it is indeed a work of art created by us humans with the help of imagination and common materials. At last, Please comment to assess the above content."
-
-#echo ${INPUT1}
-
-INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
-source /opt/intel/oneapi/setvars.sh
-
-if [ $# -gt 0 ]; then
-  export GGML_SYCL_DEVICE=$1
-else
-  export GGML_SYCL_DEVICE=0
-fi
-echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
-#export GGML_SYCL_DEBUG=1
-#export GGML_SYCL_LIST_DEVICE=1
-#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT1}" -e -n 400 -ngl 33 -c 2048
-./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0
-#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0
-

From 91b1461030c1a3763560977728b47659aaf0c481 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 24 Jan 2024 09:41:25 +0800
Subject: [PATCH 57/90] mv internal function to .cpp file

---
 ggml-sycl.cpp | 24 ++++++++++++++++++++++++
 ggml-sycl.h   | 37 -------------------------------------
 2 files changed, 24 insertions(+), 37 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 0f2edf27fac42..6f9261a5e5c68 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -98,6 +98,30 @@ typedef float dfloat; // dequantize float
 typedef sycl::float2 dfloat2;
 #endif //GGML_SYCL_F16
 
+bool   ggml_sycl_loaded(void);
+void * ggml_sycl_host_malloc(size_t size);
+void   ggml_sycl_host_free(void * ptr);
+bool   ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+void   ggml_sycl_set_tensor_split(const float * tensor_split);
+void   ggml_sycl_transform_tensor(void * data, struct ggml_tensor * tensor);
+void   ggml_sycl_free_data(struct ggml_tensor * tensor);
+void   ggml_sycl_assign_buffers(struct ggml_tensor * tensor);
+void   ggml_sycl_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+void   ggml_sycl_assign_buffers_force_inplace(struct ggml_tensor * tensor);
+void   ggml_sycl_assign_buffers_no_alloc(struct ggml_tensor * tensor);
+void   ggml_sycl_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
+void   ggml_sycl_copy_to_device(struct ggml_tensor * tensor);
+void   ggml_sycl_set_main_device(int main_device);
+void   ggml_sycl_set_mul_mat_q(bool mul_mat_q);
+void   ggml_sycl_set_scratch_size(size_t scratch_size);
+void   ggml_sycl_free_scratch(void);
+int    ggml_sycl_get_device_count(void);
+void   ggml_sycl_get_device_description(int device, char * description, size_t description_size);
+bool   ggml_backend_is_sycl(ggml_backend_t backend);
+int    ggml_backend_sycl_get_device(ggml_backend_t backend);
+int    get_main_device();
+void   print_ggml_tensor(const char*name, struct ggml_tensor *src);
+void   log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cnt);
 
 static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) {
     const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
diff --git a/ggml-sycl.h b/ggml-sycl.h
index f86703456fde6..1ad1b3737d95a 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -11,52 +11,15 @@ extern "C" {
 #define GGML_SYCL_NAME "SYCL"
 
 GGML_API void   ggml_init_sycl(void);
-
-GGML_API bool   ggml_sycl_loaded(void);
-
-GGML_API void * ggml_sycl_host_malloc(size_t size);
-GGML_API void   ggml_sycl_host_free(void * ptr);
-
-GGML_API bool   ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-GGML_API void   ggml_sycl_set_tensor_split(const float * tensor_split);
-GGML_API void   ggml_sycl_transform_tensor(void * data, struct ggml_tensor * tensor);
-GGML_API void   ggml_sycl_free_data(struct ggml_tensor * tensor);
-
-GGML_API void   ggml_sycl_assign_buffers(struct ggml_tensor * tensor);
-GGML_API void   ggml_sycl_assign_buffers_no_scratch(struct ggml_tensor * tensor);
-GGML_API void   ggml_sycl_assign_buffers_force_inplace(struct ggml_tensor * tensor);
-
-GGML_API void   ggml_sycl_assign_buffers_no_alloc(struct ggml_tensor * tensor);
-GGML_API void   ggml_sycl_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset);
-GGML_API void   ggml_sycl_copy_to_device(struct ggml_tensor * tensor);
-
-GGML_API void   ggml_sycl_set_main_device(int main_device);
-GGML_API void   ggml_sycl_set_mul_mat_q(bool mul_mat_q);
-GGML_API void   ggml_sycl_set_scratch_size(size_t scratch_size);
-GGML_API void   ggml_sycl_free_scratch(void);
 GGML_API bool   ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
-
-GGML_API int    ggml_sycl_get_device_count(void);
-GGML_API void   ggml_sycl_get_device_description(int device, char * description, size_t description_size);
-
 GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
-
-GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
-GGML_API int  ggml_backend_sycl_get_device(ggml_backend_t backend);
-
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
-
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
 
-int get_main_device();
-void print_ggml_tensor(const char*name, struct ggml_tensor *src);
-void log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cnt);
-
 #ifdef  __cplusplus
 }
 #endif
 
-
 /*
 Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
 */

From 816f480e986631b799ca015ec0bb87f5877a0d76 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 24 Jan 2024 10:10:45 +0800
Subject: [PATCH 58/90] export funciton print_sycl_devices(), mv class dpct
 definition to source file

---
 examples/sycl/ls-sycl-device.cpp |   40 +-
 ggml-sycl.cpp                    | 2812 +++++++++++++++++++++++++++++-
 ggml-sycl.h                      | 2806 +----------------------------
 3 files changed, 2806 insertions(+), 2852 deletions(-)

diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp
index f1178396a4c89..8623314319515 100644
--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@@ -1,44 +1,6 @@
-
 #include "ggml-sycl.h"
 
-#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
-#include <signal.h>
-#include <unistd.h>
-#elif defined (_WIN32)
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#include <signal.h>
-#endif
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-
-
-void print_sycl_devices(){
-    int device_count = dpct::dev_mgr::instance().device_count();
-    fprintf(stderr, "found %d SYCL devices:\n", device_count);
-    for (int id = 0; id < device_count; ++id) {
-        dpct::device_info prop;
-        dpct::get_device_info(
-            prop, dpct::dev_mgr::instance().get_device(id));
-        sycl::device cur_device = dpct::dev_mgr::instance().get_device(id);
-        fprintf(stderr, "  Device %d: %s,\tcompute capability %d.%d,\n\tmax compute_units %d,\tmax work group size %d,\tmax sub group size %d,\tglobal mem size %lu\n", id,
-                prop.get_name(), prop.get_major_version(),
-                prop.get_minor_version(),
-                prop.get_max_compute_units(),
-                prop.get_max_work_group_size(),
-                prop.get_max_sub_group_size(),
-                prop.get_global_mem_size()
-                );
-    }
-}
-
 int main(int argc, char ** argv) {
     print_sycl_devices();
     return 0;
-}
+}
\ No newline at end of file
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 6f9261a5e5c68..44ba952005669 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -24,6 +24,2807 @@
 #include "ggml.h"
 #include "ggml-backend-impl.h"
 
+/*
+Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
+*/
+// COPY from DPCT head files
+#include <sycl/sycl.hpp>
+#include <oneapi/mkl.hpp>
+#include <map>
+
+#if defined(__linux__)
+#include <sys/mman.h>
+#elif defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#else
+#error "Only support Windows and Linux."
+#endif
+
+#if defined(__linux__)
+#include <unistd.h>
+#include <sys/syscall.h>
+#endif
+#if defined(_WIN64)
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#include <windows.h>
+#endif
+
+#define DPCT_COMPATIBILITY_TEMP (900)
+
+#if defined(_MSC_VER)
+#define __dpct_align__(n) __declspec(align(n))
+#define __dpct_inline__ __forceinline
+#else
+#define __dpct_align__(n) __attribute__((aligned(n)))
+#define __dpct_inline__ __inline__ __attribute__((always_inline))
+#endif
+
+#if defined(_MSC_VER)
+#define __dpct_noinline__ __declspec(noinline)
+#else
+#define __dpct_noinline__ __attribute__((noinline))
+#endif
+
+namespace dpct
+{
+    typedef sycl::queue *queue_ptr;
+    typedef sycl::event *event_ptr;
+    typedef char *device_ptr;
+    typedef uint8_t byte_t;
+    typedef sycl::buffer<byte_t> buffer_t;
+
+    /// SYCL default exception handler
+    inline auto exception_handler = [](sycl::exception_list exceptions)
+    {
+        for (std::exception_ptr const &e : exceptions)
+        {
+            try
+            {
+                std::rethrow_exception(e);
+            }
+            catch (sycl::exception const &e)
+            {
+                std::cerr << "Caught asynchronous SYCL exception:" << std::endl
+                          << e.what() << std::endl
+                          << "Exception caught at file:" << __FILE__
+                          << ", line:" << __LINE__ << std::endl;
+            }
+        }
+    };
+
+    enum error_code
+    {
+        success = 0,
+        default_error = 999
+    };
+
+    enum memcpy_direction
+    {
+        host_to_host,
+        host_to_device,
+        device_to_host,
+        device_to_device,
+        automatic
+    };
+
+    enum memory_region
+    {
+        global = 0, // device global memory
+        constant,   // device constant memory
+        local,      // device local memory
+        shared,     // memory which can be accessed by host and device
+    };
+
+    enum class library_data_t : unsigned char
+    {
+        real_float = 0,
+        complex_float,
+        real_double,
+        complex_double,
+        real_half,
+        complex_half,
+        real_bfloat16,
+        complex_bfloat16,
+        real_int4,
+        complex_int4,
+        real_uint4,
+        complex_uint4,
+        real_int8,
+        complex_int8,
+        real_uint8,
+        complex_uint8,
+        real_int16,
+        complex_int16,
+        real_uint16,
+        complex_uint16,
+        real_int32,
+        complex_int32,
+        real_uint32,
+        complex_uint32,
+        real_int64,
+        complex_int64,
+        real_uint64,
+        complex_uint64,
+        real_int8_4,
+        real_int8_32,
+        real_uint8_4,
+        library_data_t_size
+    };
+
+    template <typename T>
+    struct DataType
+    {
+        using T2 = T;
+    };
+    template <typename T>
+    struct DataType<sycl::vec<T, 2>>
+    {
+        using T2 = std::complex<T>;
+    };
+
+    static void destroy_event(event_ptr event)
+    {
+        delete event;
+    }
+
+    static inline unsigned int get_tid()
+    {
+#if defined(__linux__)
+        return syscall(SYS_gettid);
+#elif defined(_WIN64)
+        return GetCurrentThreadId();
+#else
+#error "Only support Windows and Linux."
+#endif
+    }
+
+    namespace detail
+    {
+        static void get_version(const sycl::device &dev, int &major, int &minor)
+        {
+            // Version string has the following format:
+            // a. OpenCL<space><major.minor><space><vendor-specific-information>
+            // b. <major.minor>
+            std::string ver;
+            ver = dev.get_info<sycl::info::device::version>();
+            std::string::size_type i = 0;
+            while (i < ver.size())
+            {
+                if (isdigit(ver[i]))
+                    break;
+                i++;
+            }
+            major = std::stoi(&(ver[i]));
+            while (i < ver.size())
+            {
+                if (ver[i] == '.')
+                    break;
+                i++;
+            }
+            i++;
+            minor = std::stoi(&(ver[i]));
+        }
+
+        template <typename tag, typename T>
+        class generic_error_type
+        {
+        public:
+            generic_error_type() = default;
+            generic_error_type(T value) : value{value} {}
+            operator T() const { return value; }
+
+        private:
+            T value;
+        };
+
+    } // namespace detail
+
+    /// Pitched 2D/3D memory data.
+    class pitched_data
+    {
+    public:
+        pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
+        pitched_data(void *data, size_t pitch, size_t x, size_t y)
+            : _data(data), _pitch(pitch), _x(x), _y(y) {}
+
+        void *get_data_ptr() { return _data; }
+        void set_data_ptr(void *data) { _data = data; }
+
+        size_t get_pitch() { return _pitch; }
+        void set_pitch(size_t pitch) { _pitch = pitch; }
+
+        size_t get_x() { return _x; }
+        void set_x(size_t x) { _x = x; };
+
+        size_t get_y() { return _y; }
+        void set_y(size_t y) { _y = y; }
+
+    private:
+        void *_data;
+        size_t _pitch, _x, _y;
+    };
+
+    class device_info
+    {
+    public:
+        // get interface
+        const char *get_name() const { return _name; }
+        char *get_name() { return _name; }
+        template <typename WorkItemSizesTy = sycl::range<3>,
+                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                       std::is_same_v<WorkItemSizesTy, int *>,
+                                   int> = 0>
+        auto get_max_work_item_sizes() const
+        {
+            if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
+                return sycl::range<3>(_max_work_item_sizes_i[0],
+                                      _max_work_item_sizes_i[1],
+                                      _max_work_item_sizes_i[2]);
+            else
+            {
+                return _max_work_item_sizes_i;
+            }
+        }
+        template <typename WorkItemSizesTy = sycl::range<3>,
+                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
+                                       std::is_same_v<WorkItemSizesTy, int *>,
+                                   int> = 0>
+        auto get_max_work_item_sizes()
+        {
+            if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
+                return sycl::range<3>(_max_work_item_sizes_i[0],
+                                      _max_work_item_sizes_i[1],
+                                      _max_work_item_sizes_i[2]);
+            else
+            {
+                return _max_work_item_sizes_i;
+            }
+        }
+        bool get_host_unified_memory() const { return _host_unified_memory; }
+        int get_major_version() const { return _major; }
+        int get_minor_version() const { return _minor; }
+        int get_integrated() const { return _integrated; }
+        int get_max_clock_frequency() const { return _frequency; }
+        int get_max_compute_units() const { return _max_compute_units; }
+        int get_max_work_group_size() const { return _max_work_group_size; }
+        int get_max_sub_group_size() const { return _max_sub_group_size; }
+        int get_max_work_items_per_compute_unit() const
+        {
+            return _max_work_items_per_compute_unit;
+        }
+        int get_max_register_size_per_work_group() const
+        {
+            return _max_register_size_per_work_group;
+        }
+        template <typename NDRangeSizeTy = size_t *,
+                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                       std::is_same_v<NDRangeSizeTy, int *>,
+                                   int> = 0>
+        auto get_max_nd_range_size() const
+        {
+            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+                return _max_nd_range_size;
+            else
+                return _max_nd_range_size_i;
+        }
+        template <typename NDRangeSizeTy = size_t *,
+                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
+                                       std::is_same_v<NDRangeSizeTy, int *>,
+                                   int> = 0>
+        auto get_max_nd_range_size()
+        {
+            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
+                return _max_nd_range_size;
+            else
+                return _max_nd_range_size_i;
+        }
+        size_t get_global_mem_size() const { return _global_mem_size; }
+        size_t get_local_mem_size() const { return _local_mem_size; }
+        /// Returns the maximum clock rate of device's global memory in kHz. If
+        /// compiler does not support this API then returns default value 3200000 kHz.
+        unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
+        /// Returns the maximum bus width between device and memory in bits. If
+        /// compiler does not support this API then returns default value 64 bits.
+        unsigned int get_memory_bus_width() const { return _memory_bus_width; }
+        uint32_t get_device_id() const { return _device_id; }
+        std::array<unsigned char, 16> get_uuid() const { return _uuid; }
+        /// Returns global memory cache size in bytes.
+        unsigned int get_global_mem_cache_size() const
+        {
+            return _global_mem_cache_size;
+        }
+
+        // set interface
+        void set_name(const char *name)
+        {
+            size_t length = strlen(name);
+            if (length < 256)
+            {
+                std::memcpy(_name, name, length + 1);
+            }
+            else
+            {
+                std::memcpy(_name, name, 255);
+                _name[255] = '\0';
+            }
+        }
+        void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes)
+        {
+            for (int i = 0; i < 3; ++i)
+                _max_work_item_sizes_i[i] = max_work_item_sizes[i];
+        }
+        [[deprecated]] void
+        set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
+        {
+            for (int i = 0; i < 3; ++i)
+            {
+                _max_work_item_sizes_i[i] = max_work_item_sizes[i];
+            }
+        }
+        void set_host_unified_memory(bool host_unified_memory)
+        {
+            _host_unified_memory = host_unified_memory;
+        }
+        void set_major_version(int major) { _major = major; }
+        void set_minor_version(int minor) { _minor = minor; }
+        void set_integrated(int integrated) { _integrated = integrated; }
+        void set_max_clock_frequency(int frequency) { _frequency = frequency; }
+        void set_max_compute_units(int max_compute_units)
+        {
+            _max_compute_units = max_compute_units;
+        }
+        void set_global_mem_size(size_t global_mem_size)
+        {
+            _global_mem_size = global_mem_size;
+        }
+        void set_local_mem_size(size_t local_mem_size)
+        {
+            _local_mem_size = local_mem_size;
+        }
+        void set_max_work_group_size(int max_work_group_size)
+        {
+            _max_work_group_size = max_work_group_size;
+        }
+        void set_max_sub_group_size(int max_sub_group_size)
+        {
+            _max_sub_group_size = max_sub_group_size;
+        }
+        void
+        set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
+        {
+            _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
+        }
+        void set_max_nd_range_size(int max_nd_range_size[])
+        {
+            for (int i = 0; i < 3; i++)
+            {
+                _max_nd_range_size[i] = max_nd_range_size[i];
+                _max_nd_range_size_i[i] = max_nd_range_size[i];
+            }
+        }
+        void set_memory_clock_rate(unsigned int memory_clock_rate)
+        {
+            _memory_clock_rate = memory_clock_rate;
+        }
+        void set_memory_bus_width(unsigned int memory_bus_width)
+        {
+            _memory_bus_width = memory_bus_width;
+        }
+        void
+        set_max_register_size_per_work_group(int max_register_size_per_work_group)
+        {
+            _max_register_size_per_work_group = max_register_size_per_work_group;
+        }
+        void set_device_id(uint32_t device_id)
+        {
+            _device_id = device_id;
+        }
+        void set_uuid(std::array<unsigned char, 16> uuid)
+        {
+            _uuid = std::move(uuid);
+        }
+        void set_global_mem_cache_size(unsigned int global_mem_cache_size)
+        {
+            _global_mem_cache_size = global_mem_cache_size;
+        }
+
+    private:
+        char _name[256];
+        int _max_work_item_sizes_i[3];
+        bool _host_unified_memory = false;
+        int _major;
+        int _minor;
+        int _integrated = 0;
+        int _frequency;
+        // Set estimated value 3200000 kHz as default value.
+        unsigned int _memory_clock_rate = 3200000;
+        // Set estimated value 64 bits as default value.
+        unsigned int _memory_bus_width = 64;
+        unsigned int _global_mem_cache_size;
+        int _max_compute_units;
+        int _max_work_group_size;
+        int _max_sub_group_size;
+        int _max_work_items_per_compute_unit;
+        int _max_register_size_per_work_group;
+        size_t _global_mem_size;
+        size_t _local_mem_size;
+        size_t _max_nd_range_size[3];
+        int _max_nd_range_size_i[3];
+        uint32_t _device_id;
+        std::array<unsigned char, 16> _uuid;
+    };
+
+    static int get_major_version(const sycl::device &dev)
+    {
+        int major, minor;
+        detail::get_version(dev, major, minor);
+        return major;
+    }
+
+    static int get_minor_version(const sycl::device &dev)
+    {
+        int major, minor;
+        detail::get_version(dev, major, minor);
+        return minor;
+    }
+
+    static void get_device_info(device_info &out, const sycl::device &dev)
+    {
+        device_info prop;
+        prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
+
+        int major, minor;
+        detail::get_version(dev, major, minor);
+        prop.set_major_version(major);
+        prop.set_minor_version(minor);
+
+        prop.set_max_work_item_sizes(
+#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
+            // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
+            // is an enum class element
+            dev.get_info<sycl::info::device::max_work_item_sizes>());
+#else
+            // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
+            // an int
+            dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
+#endif
+        prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));
+
+        prop.set_max_clock_frequency(
+            dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);
+
+        prop.set_max_compute_units(
+            dev.get_info<sycl::info::device::max_compute_units>());
+        prop.set_max_work_group_size(
+            dev.get_info<sycl::info::device::max_work_group_size>());
+        prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
+        prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
+
+#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
+        if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
+        {
+            unsigned int tmp =
+                dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
+            if (tmp != 0)
+                prop.set_memory_clock_rate(1000 * tmp);
+        }
+        if (dev.has(sycl::aspect::ext_intel_memory_bus_width))
+        {
+            prop.set_memory_bus_width(
+                dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
+        }
+        if (dev.has(sycl::aspect::ext_intel_device_id))
+        {
+            prop.set_device_id(
+                dev.get_info<sycl::ext::intel::info::device::device_id>());
+        }
+        if (dev.has(sycl::aspect::ext_intel_device_info_uuid))
+        {
+            prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
+        }
+#elif defined(_MSC_VER) && !defined(__clang__)
+#pragma message("get_device_info: querying memory_clock_rate and \
+        memory_bus_width are not supported by the compiler used. \
+        Use 3200000 kHz as memory_clock_rate default value. \
+        Use 64 bits as memory_bus_width default value.")
+#else
+#warning "get_device_info: querying memory_clock_rate and \
+        memory_bus_width are not supported by the compiler used. \
+        Use 3200000 kHz as memory_clock_rate default value. \
+        Use 64 bits as memory_bus_width default value."
+#endif
+
+        size_t max_sub_group_size = 1;
+        std::vector<size_t> sub_group_sizes =
+            dev.get_info<sycl::info::device::sub_group_sizes>();
+
+        for (const auto &sub_group_size : sub_group_sizes)
+        {
+            if (max_sub_group_size < sub_group_size)
+                max_sub_group_size = sub_group_size;
+        }
+
+        prop.set_max_sub_group_size(max_sub_group_size);
+
+        prop.set_max_work_items_per_compute_unit(
+            dev.get_info<sycl::info::device::max_work_group_size>());
+        int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
+        prop.set_max_nd_range_size(max_nd_range_size);
+
+        // Estimates max register size per work group, feel free to update the value
+        // according to device properties.
+        prop.set_max_register_size_per_work_group(65536);
+
+        prop.set_global_mem_cache_size(
+            dev.get_info<sycl::info::device::global_mem_cache_size>());
+        out = prop;
+    }
+
+    /// dpct device extension
+    class device_ext : public sycl::device
+    {
+        typedef std::mutex mutex_type;
+
+    public:
+        device_ext() : sycl::device(), _ctx(*this) {}
+        ~device_ext()
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            clear_queues();
+        }
+        device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            init_queues();
+        }
+
+        int is_native_atomic_supported() { return 0; }
+        int get_major_version() const
+        {
+            return dpct::get_major_version(*this);
+        }
+
+        int get_minor_version() const
+        {
+            return dpct::get_minor_version(*this);
+        }
+
+        int get_max_compute_units() const
+        {
+            return get_device_info().get_max_compute_units();
+        }
+
+        /// Return the maximum clock frequency of this device in KHz.
+        int get_max_clock_frequency() const
+        {
+            return get_device_info().get_max_clock_frequency();
+        }
+
+        int get_integrated() const { return get_device_info().get_integrated(); }
+
+        int get_max_sub_group_size() const
+        {
+            return get_device_info().get_max_sub_group_size();
+        }
+
+        int get_max_register_size_per_work_group() const
+        {
+            return get_device_info().get_max_register_size_per_work_group();
+        }
+
+        int get_max_work_group_size() const
+        {
+            return get_device_info().get_max_work_group_size();
+        }
+
+        int get_mem_base_addr_align() const
+        {
+            return get_info<sycl::info::device::mem_base_addr_align>();
+        }
+
+        size_t get_global_mem_size() const
+        {
+            return get_device_info().get_global_mem_size();
+        }
+
+        /// Get the number of bytes of free and total memory on the SYCL device.
+        /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
+        /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
+        void get_memory_info(size_t &free_memory, size_t &total_memory)
+        {
+#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
+            if (!has(sycl::aspect::ext_intel_free_memory))
+            {
+                std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
+                free_memory = 0;
+            }
+            else
+            {
+                free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
+            }
+#else
+            std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
+            free_memory = 0;
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma message("Querying the number of bytes of free memory is not supported")
+#else
+#warning "Querying the number of bytes of free memory is not supported"
+#endif
+#endif
+            total_memory = get_device_info().get_global_mem_size();
+        }
+
+        void get_device_info(device_info &out) const
+        {
+            dpct::get_device_info(out, *this);
+        }
+
+        device_info get_device_info() const
+        {
+            device_info prop;
+            dpct::get_device_info(prop, *this);
+            return prop;
+        }
+
+        void reset()
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            clear_queues();
+            init_queues();
+        }
+
+        sycl::queue &in_order_queue() { return *_q_in_order; }
+
+        sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
+
+        sycl::queue &default_queue()
+        {
+#ifdef DPCT_USM_LEVEL_NONE
+            return out_of_order_queue();
+#else
+            return in_order_queue();
+#endif // DPCT_USM_LEVEL_NONE
+        }
+
+        void queues_wait_and_throw()
+        {
+            std::unique_lock<mutex_type> lock(m_mutex);
+            std::vector<std::shared_ptr<sycl::queue>> current_queues(
+                _queues);
+            lock.unlock();
+            for (const auto &q : current_queues)
+            {
+                q->wait_and_throw();
+            }
+            // Guard the destruct of current_queues to make sure the ref count is safe.
+            lock.lock();
+        }
+
+        sycl::queue *create_queue(bool enable_exception_handler = false)
+        {
+#ifdef DPCT_USM_LEVEL_NONE
+            return create_out_of_order_queue(enable_exception_handler);
+#else
+            return create_in_order_queue(enable_exception_handler);
+#endif // DPCT_USM_LEVEL_NONE
+        }
+
+        sycl::queue *create_in_order_queue(bool enable_exception_handler = false)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            return create_queue_impl(enable_exception_handler,
+                                     sycl::property::queue::in_order());
+        }
+
+        sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            return create_queue_impl(enable_exception_handler);
+        }
+
+        void destroy_queue(sycl::queue *&queue)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
+                                         [=](const std::shared_ptr<sycl::queue> &q) -> bool
+                                         {
+                                             return q.get() == queue;
+                                         }),
+                          _queues.end());
+            queue = nullptr;
+        }
+        void set_saved_queue(sycl::queue *q)
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            _saved_queue = q;
+        }
+        sycl::queue *get_saved_queue() const
+        {
+            std::lock_guard<mutex_type> lock(m_mutex);
+            return _saved_queue;
+        }
+        sycl::context get_context() const { return _ctx; }
+
+    private:
+        void clear_queues()
+        {
+            _queues.clear();
+            _q_in_order = _q_out_of_order = _saved_queue = nullptr;
+        }
+
+        void init_queues()
+        {
+            _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
+            _q_out_of_order = create_queue_impl(true);
+            _saved_queue = &default_queue();
+        }
+
+        /// Caller should acquire resource \p m_mutex before calling this function.
+        template <class... Properties>
+        sycl::queue *create_queue_impl(bool enable_exception_handler,
+                                       Properties... properties)
+        {
+            sycl::async_handler eh = {};
+            if (enable_exception_handler)
+            {
+                eh = exception_handler;
+            }
+            _queues.push_back(std::make_shared<sycl::queue>(
+                _ctx, *this, eh,
+                sycl::property_list(
+#ifdef DPCT_PROFILING_ENABLED
+                    sycl::property::queue::enable_profiling(),
+#endif
+                    properties...)));
+
+            return _queues.back().get();
+        }
+
+        void get_version(int &major, int &minor) const
+        {
+            detail::get_version(*this, major, minor);
+        }
+        sycl::queue *_q_in_order, *_q_out_of_order;
+        sycl::queue *_saved_queue;
+        sycl::context _ctx;
+        std::vector<std::shared_ptr<sycl::queue>> _queues;
+        mutable mutex_type m_mutex;
+    };
+
+    /// device manager
+    class dev_mgr
+    {
+    public:
+        device_ext &current_device()
+        {
+            unsigned int dev_id = current_device_id();
+            check_id(dev_id);
+            return *_devs[dev_id];
+        }
+        device_ext &cpu_device() const
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            if (_cpu_device == -1)
+            {
+                throw std::runtime_error("no valid cpu device");
+            }
+            else
+            {
+                return *_devs[_cpu_device];
+            }
+        }
+        device_ext &get_device(unsigned int id) const
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            check_id(id);
+            return *_devs[id];
+        }
+        unsigned int current_device_id() const
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            auto it = _thread2dev_map.find(get_tid());
+            if (it != _thread2dev_map.end())
+                return it->second;
+            return DEFAULT_DEVICE_ID;
+        }
+
+        /// Select device with a device ID.
+        /// \param [in] id The id of the device which can
+        /// be obtained through get_device_id(const sycl::device).
+        void select_device(unsigned int id)
+        {
+            std::lock_guard<std::recursive_mutex> lock(m_mutex);
+            check_id(id);
+            _thread2dev_map[get_tid()] = id;
+        }
+        unsigned int device_count() { return _devs.size(); }
+
+        unsigned int get_device_id(const sycl::device &dev)
+        {
+            unsigned int id = 0;
+            for (auto dev_item : _devs)
+            {
+                if (*dev_item == dev)
+                {
+                    break;
+                }
+                id++;
+            }
+            return id;
+        }
+
+        template <class DeviceSelector>
+        std::enable_if_t<
+            std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
+        select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
+        {
+            sycl::device selected_device = sycl::device(selector);
+            unsigned int selected_device_id = get_device_id(selected_device);
+            select_device(selected_device_id);
+        }
+
+        /// Returns the instance of device manager singleton.
+        static dev_mgr &instance()
+        {
+            static dev_mgr d_m;
+            return d_m;
+        }
+        dev_mgr(const dev_mgr &) = delete;
+        dev_mgr &operator=(const dev_mgr &) = delete;
+        dev_mgr(dev_mgr &&) = delete;
+        dev_mgr &operator=(dev_mgr &&) = delete;
+
+    private:
+        mutable std::recursive_mutex m_mutex;
+        dev_mgr()
+        {
+            sycl::device default_device =
+                sycl::device(sycl::default_selector_v);
+            _devs.push_back(std::make_shared<device_ext>(default_device));
+
+            std::vector<sycl::device> sycl_all_devs =
+                sycl::device::get_devices(sycl::info::device_type::all);
+            // Collect other devices except for the default device.
+            if (default_device.is_cpu())
+                _cpu_device = 0;
+            for (auto &dev : sycl_all_devs)
+            {
+                if (dev == default_device)
+                {
+                    continue;
+                }
+                _devs.push_back(std::make_shared<device_ext>(dev));
+                if (_cpu_device == -1 && dev.is_cpu())
+                {
+                    _cpu_device = _devs.size() - 1;
+                }
+            }
+        }
+        void check_id(unsigned int id) const
+        {
+            if (id >= _devs.size())
+            {
+                throw std::runtime_error("invalid device id");
+            }
+        }
+        std::vector<std::shared_ptr<device_ext>> _devs;
+        /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
+        /// thread id in _thread2dev_map, which means default device should be used
+        /// for the current thread.
+        const unsigned int DEFAULT_DEVICE_ID = 0;
+        /// thread-id to device-id map.
+        std::map<unsigned int, unsigned int> _thread2dev_map;
+        int _cpu_device = -1;
+    };
+
+    static inline sycl::queue &get_default_queue()
+    {
+        return dev_mgr::instance().current_device().default_queue();
+    }
+
+    namespace detail
+    {
+        enum class pointer_access_attribute
+        {
+            host_only = 0,
+            device_only,
+            host_device,
+            end
+        };
+
+        static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
+                                                              const void *ptr)
+        {
+#ifdef DPCT_USM_LEVEL_NONE
+            return mem_mgr::instance().is_device_ptr(ptr)
+                       ? pointer_access_attribute::device_only
+                       : pointer_access_attribute::host_only;
+#else
+            switch (sycl::get_pointer_type(ptr, q.get_context()))
+            {
+            case sycl::usm::alloc::unknown:
+                return pointer_access_attribute::host_only;
+            case sycl::usm::alloc::device:
+                return pointer_access_attribute::device_only;
+            case sycl::usm::alloc::shared:
+            case sycl::usm::alloc::host:
+                return pointer_access_attribute::host_device;
+            }
+#endif
+        }
+
+        template <typename ArgT>
+        inline constexpr std::uint64_t get_type_combination_id(ArgT Val)
+        {
+            static_assert((unsigned char)library_data_t::library_data_t_size <=
+                              std::numeric_limits<unsigned char>::max() &&
+                          "library_data_t size exceeds limit.");
+            static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
+            return (std::uint64_t)Val;
+        }
+
+        template <typename FirstT, typename... RestT>
+        inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
+                                                               RestT... RestVal)
+        {
+            static_assert((std::uint8_t)library_data_t::library_data_t_size <=
+                              std::numeric_limits<unsigned char>::max() &&
+                          "library_data_t size exceeds limit.");
+            static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
+            static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
+            return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
+        }
+
+        class mem_mgr
+        {
+            mem_mgr()
+            {
+                // Reserved address space, no real memory allocation happens here.
+#if defined(__linux__)
+                mapped_address_space =
+                    (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
+                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+#elif defined(_WIN64)
+                mapped_address_space = (byte_t *)VirtualAlloc(
+                    NULL,               // NULL specified as the base address parameter
+                    mapped_region_size, // Size of allocation
+                    MEM_RESERVE,        // Allocate reserved pages
+                    PAGE_NOACCESS);     // Protection = no access
+#else
+#error "Only support Windows and Linux."
+#endif
+                next_free = mapped_address_space;
+            };
+
+        public:
+            using buffer_id_t = int;
+
+            struct allocation
+            {
+                buffer_t buffer;
+                byte_t *alloc_ptr;
+                size_t size;
+            };
+
+            ~mem_mgr()
+            {
+#if defined(__linux__)
+                munmap(mapped_address_space, mapped_region_size);
+#elif defined(_WIN64)
+                VirtualFree(mapped_address_space, 0, MEM_RELEASE);
+#else
+#error "Only support Windows and Linux."
+#endif
+            };
+
+            mem_mgr(const mem_mgr &) = delete;
+            mem_mgr &operator=(const mem_mgr &) = delete;
+            mem_mgr(mem_mgr &&) = delete;
+            mem_mgr &operator=(mem_mgr &&) = delete;
+
+            /// Allocate
+            void *mem_alloc(size_t size)
+            {
+                if (!size)
+                    return nullptr;
+                std::lock_guard<std::mutex> lock(m_mutex);
+                if (next_free + size > mapped_address_space + mapped_region_size)
+                {
+                    throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool");
+                }
+                // Allocation
+                sycl::range<1> r(size);
+                buffer_t buf(r);
+                allocation A{buf, next_free, size};
+                // Map allocation to device pointer
+                void *result = next_free;
+                m_map.emplace(next_free + size, A);
+                // Update pointer to the next free space.
+                next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
+
+                return result;
+            }
+
+            /// Deallocate
+            void mem_free(const void *ptr)
+            {
+                if (!ptr)
+                    return;
+                std::lock_guard<std::mutex> lock(m_mutex);
+                auto it = get_map_iterator(ptr);
+                m_map.erase(it);
+            }
+
+            /// map: device pointer -> allocation(buffer, alloc_ptr, size)
+            allocation translate_ptr(const void *ptr)
+            {
+                std::lock_guard<std::mutex> lock(m_mutex);
+                auto it = get_map_iterator(ptr);
+                return it->second;
+            }
+
+            /// Check if the pointer represents device pointer or not.
+            bool is_device_ptr(const void *ptr) const
+            {
+                std::lock_guard<std::mutex> lock(m_mutex);
+                return (mapped_address_space <= ptr) &&
+                       (ptr < mapped_address_space + mapped_region_size);
+            }
+
+            /// Returns the instance of memory manager singleton.
+            static mem_mgr &instance()
+            {
+                static mem_mgr m;
+                return m;
+            }
+
+        private:
+            std::map<byte_t *, allocation> m_map;
+            mutable std::mutex m_mutex;
+            byte_t *mapped_address_space;
+            byte_t *next_free;
+            const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
+            const size_t alignment = 256;
+            /// This padding may be defined to some positive value to debug
+            /// out of bound accesses.
+            const size_t extra_padding = 0;
+
+            std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
+            {
+                auto it = m_map.upper_bound((byte_t *)ptr);
+                if (it == m_map.end())
+                {
+                    // Not a virtual pointer.
+                    throw std::runtime_error("can not get buffer from non-virtual pointer");
+                }
+                const allocation &alloc = it->second;
+                if (ptr < alloc.alloc_ptr)
+                {
+                    // Out of bound.
+                    // This may happen if there's a gap between allocations due to alignment
+                    // or extra padding and pointer points to this gap.
+                    throw std::runtime_error("invalid virtual pointer");
+                }
+                return it;
+            }
+        };
+
+        template <class T, memory_region Memory, size_t Dimension>
+        class accessor;
+        template <memory_region Memory, class T = byte_t>
+        class memory_traits
+        {
+        public:
+            static constexpr sycl::access::target target =
+                sycl::access::target::device;
+            static constexpr sycl::access_mode mode =
+                (Memory == constant) ? sycl::access_mode::read
+                                     : sycl::access_mode::read_write;
+            static constexpr size_t type_size = sizeof(T);
+            using element_t =
+                typename std::conditional<Memory == constant, const T, T>::type;
+            using value_t = typename std::remove_cv<T>::type;
+            template <size_t Dimension = 1>
+            using accessor_t = typename std::conditional<
+                Memory == local, sycl::local_accessor<value_t, Dimension>,
+                sycl::accessor<T, Dimension, mode, target>>::type;
+            using pointer_t = T *;
+        };
+
+        static inline void *dpct_malloc(size_t size, sycl::queue &q)
+        {
+#ifdef DPCT_USM_LEVEL_NONE
+            return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
+#else
+            return sycl::malloc_device(size, q.get_device(), q.get_context());
+#endif // DPCT_USM_LEVEL_NONE
+        }
+
+#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
+        static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
+                                        sycl::queue &q)
+        {
+            pitch = PITCH_DEFAULT_ALIGN(x);
+            return dpct_malloc(pitch * y * z, q);
+        }
+
+        /**
+         * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] dev_ptr Pointer to the virtual device memory address.
+         * @param [in] value The value to be set.
+         * @param [in] size Number of elements to be set to the value.
+         * @return An event representing the memset operation.
+         */
+        template <typename valueT>
+        static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
+                                              valueT value, size_t size)
+        {
+#ifdef DPCT_USM_LEVEL_NONE
+            auto &mm = mem_mgr::instance();
+            assert(mm.is_device_ptr(dev_ptr));
+            auto alloc = mm.translate_ptr(dev_ptr);
+            size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
+
+            return q.submit([&](sycl::handler &cgh)
+                            {
+    auto r = sycl::range<1>(size);
+    auto o = sycl::id<1>(offset);
+    auto new_buffer = alloc.buffer.reinterpret<valueT>(
+        sycl::range<1>(alloc.size / sizeof(valueT)));
+    sycl::accessor<valueT, 1, sycl::access_mode::write,
+                sycl::access::target::device>
+        acc(new_buffer, cgh, r, o);
+    cgh.fill(acc, value); });
+#else
+            return q.fill(dev_ptr, value, size);
+#endif // DPCT_USM_LEVEL_NONE
+        }
+
+        /**
+         * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] data Pointer to the pitched device memory region.
+         * @param [in] value The value to be set.
+         * @param [in] size 3D memory region by number of elements.
+         * @return An event list representing the memset operations.
+         */
+        template <typename valueT>
+        static inline std::vector<sycl::event>
+        dpct_memset(sycl::queue &q, pitched_data data, valueT value,
+                    sycl::range<3> size)
+        {
+            std::vector<sycl::event> event_list;
+            size_t slice = data.get_pitch() * data.get_y();
+            unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
+            for (size_t z = 0; z < size.get(2); ++z)
+            {
+                unsigned char *data_ptr = data_surface;
+                for (size_t y = 0; y < size.get(1); ++y)
+                {
+                    event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
+                    data_ptr += data.get_pitch();
+                }
+                data_surface += slice;
+            }
+            return event_list;
+        }
+
+        /**
+         * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
+         * @tparam valueT The type of the element to be set.
+         * @param [in] q The queue in which the operation is done.
+         * @param [in] ptr Pointer to the virtual device memory.
+         * @param [in] pitch The pitch size by number of elements, including padding.
+         * @param [in] val The value to be set.
+         * @param [in] x The width of memory region by number of elements.
+         * @param [in] y The height of memory region by number of elements.
+         * @return An event list representing the memset operations.
+         */
+        template <typename valueT>
+        static inline std::vector<sycl::event>
+        dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
+                    size_t y)
+        {
+            return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
+                               sycl::range<3>(x, y, 1));
+        }
+
+        static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
+                                                        const void *from_ptr,
+                                                        memcpy_direction dir)
+        {
+            switch (dir)
+            {
+            case memcpy_direction::host_to_host:
+            case memcpy_direction::host_to_device:
+            case memcpy_direction::device_to_host:
+            case memcpy_direction::device_to_device:
+                return dir;
+            case memcpy_direction::automatic:
+            {
+                // table[to_attribute][from_attribute]
+                static const memcpy_direction
+                    direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
+                                   [static_cast<unsigned>(pointer_access_attribute::end)] =
+                                       {{memcpy_direction::host_to_host,
+                                         memcpy_direction::device_to_host,
+                                         memcpy_direction::host_to_host},
+                                        {memcpy_direction::host_to_device,
+                                         memcpy_direction::device_to_device,
+                                         memcpy_direction::device_to_device},
+                                        {memcpy_direction::host_to_host,
+                                         memcpy_direction::device_to_device,
+                                         memcpy_direction::device_to_device}};
+                return direction_table[static_cast<unsigned>(get_pointer_attribute(
+                    q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
+            }
+            default:
+                throw std::runtime_error("dpct_memcpy: invalid direction value");
+            }
+        }
+
+        static sycl::event
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+                    memcpy_direction direction,
+                    const std::vector<sycl::event> &dep_events = {})
+        {
+            if (!size)
+                return sycl::event{};
+#ifdef DPCT_USM_LEVEL_NONE
+            auto &mm = mem_mgr::instance();
+            auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+
+            switch (real_direction)
+            {
+            case host_to_host:
+                return q.submit([&](sycl::handler &cgh)
+                                {
+    cgh.depends_on(dep_events);
+    cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
+            case host_to_device:
+            {
+                auto alloc = mm.translate_ptr(to_ptr);
+                size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
+                return q.submit([&](sycl::handler &cgh)
+                                {
+    cgh.depends_on(dep_events);
+    auto r = sycl::range<1>(size);
+    auto o = sycl::id<1>(offset);
+    sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                        sycl::access::target::device>
+        acc(alloc.buffer, cgh, r, o);
+    cgh.copy(from_ptr, acc); });
+            }
+            case device_to_host:
+            {
+                auto alloc = mm.translate_ptr(from_ptr);
+                size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
+                return q.submit([&](sycl::handler &cgh)
+                                {
+    cgh.depends_on(dep_events);
+    auto r = sycl::range<1>(size);
+    auto o = sycl::id<1>(offset);
+    sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                        sycl::access::target::device>
+        acc(alloc.buffer, cgh, r, o);
+    cgh.copy(acc, to_ptr); });
+            }
+            case device_to_device:
+            {
+                auto to_alloc = mm.translate_ptr(to_ptr);
+                auto from_alloc = mm.translate_ptr(from_ptr);
+                size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
+                size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
+                return q.submit([&](sycl::handler &cgh)
+                                {
+    cgh.depends_on(dep_events);
+    auto r = sycl::range<1>(size);
+    auto to_o = sycl::id<1>(to_offset);
+    auto from_o = sycl::id<1>(from_offset);
+    sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                        sycl::access::target::device>
+        to_acc(to_alloc.buffer, cgh, r, to_o);
+    sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                        sycl::access::target::device>
+        from_acc(from_alloc.buffer, cgh, r, from_o);
+    cgh.copy(from_acc, to_acc); });
+            }
+            default:
+                throw std::runtime_error("dpct_memcpy: invalid direction value");
+            }
+#else
+            return q.memcpy(to_ptr, from_ptr, size, dep_events);
+#endif // DPCT_USM_LEVEL_NONE
+        }
+
+        // Get actual copy range and make sure it will not exceed range.
+        static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
+                                            size_t pitch)
+        {
+            return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
+        }
+
+        static inline size_t get_offset(sycl::id<3> id, size_t slice,
+                                        size_t pitch)
+        {
+            return slice * id.get(2) + pitch * id.get(1) + id.get(0);
+        }
+
+        /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
+        /// and \p from_range to another specified by \p to_ptr and \p to_range.
+        static inline std::vector<sycl::event>
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                    sycl::range<3> to_range, sycl::range<3> from_range,
+                    sycl::id<3> to_id, sycl::id<3> from_id,
+                    sycl::range<3> size, memcpy_direction direction,
+                    const std::vector<sycl::event> &dep_events = {})
+        {
+            // RAII for host pointer
+            class host_buffer
+            {
+                void *_buf;
+                size_t _size;
+                sycl::queue &_q;
+                const std::vector<sycl::event> &_deps; // free operation depends
+
+            public:
+                host_buffer(size_t size, sycl::queue &q,
+                            const std::vector<sycl::event> &deps)
+                    : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
+                void *get_ptr() const { return _buf; }
+                size_t get_size() const { return _size; }
+                ~host_buffer()
+                {
+                    if (_buf)
+                    {
+                        _q.submit([&](sycl::handler &cgh)
+                                  {
+        cgh.depends_on(_deps);
+        cgh.host_task([buf = _buf] { std::free(buf); }); });
+                    }
+                }
+            };
+            std::vector<sycl::event> event_list;
+
+            size_t to_slice = to_range.get(1) * to_range.get(0),
+                   from_slice = from_range.get(1) * from_range.get(0);
+            unsigned char *to_surface =
+                (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+            const unsigned char *from_surface =
+                (const unsigned char *)from_ptr +
+                get_offset(from_id, from_slice, from_range.get(0));
+
+            if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
+            {
+                return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                                    direction, dep_events)};
+            }
+            direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+            size_t size_slice = size.get(1) * size.get(0);
+            switch (direction)
+            {
+            case host_to_host:
+                for (size_t z = 0; z < size.get(2); ++z)
+                {
+                    unsigned char *to_ptr = to_surface;
+                    const unsigned char *from_ptr = from_surface;
+                    if (to_range.get(0) == from_range.get(0) &&
+                        to_range.get(0) == size.get(0))
+                    {
+                        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
+                                                         direction, dep_events));
+                    }
+                    else
+                    {
+                        for (size_t y = 0; y < size.get(1); ++y)
+                        {
+                            event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
+                                                             direction, dep_events));
+                            to_ptr += to_range.get(0);
+                            from_ptr += from_range.get(0);
+                        }
+                    }
+                    to_surface += to_slice;
+                    from_surface += from_slice;
+                }
+                break;
+            case host_to_device:
+            {
+                host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
+                                event_list);
+                std::vector<sycl::event> host_events;
+                if (to_slice == size_slice)
+                {
+                    // Copy host data to a temp host buffer with the shape of target.
+                    host_events =
+                        dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
+                                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                                    host_to_host, dep_events);
+                }
+                else
+                {
+                    // Copy host data to a temp host buffer with the shape of target.
+                    host_events = dpct_memcpy(
+                        q, buf.get_ptr(), from_surface, to_range, from_range,
+                        sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
+                        // If has padding data, not sure whether it is useless. So fill temp
+                        // buffer with it.
+                        std::vector<sycl::event>{
+                            dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                                        device_to_host, dep_events)});
+                }
+                // Copy from temp host buffer to device with only one submit.
+                event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
+                                                 buf.get_size(), host_to_device,
+                                                 host_events));
+                break;
+            }
+            case device_to_host:
+            {
+                host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
+                                event_list);
+                // Copy from host temp buffer to host target with reshaping.
+                event_list = dpct_memcpy(
+                    q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
+                    sycl::id<3>(0, 0, 0), size, host_to_host,
+                    // Copy from device to temp host buffer with only one submit.
+                    std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
+                                                         buf.get_size(),
+                                                         device_to_host, dep_events)});
+                break;
+            }
+            case device_to_device:
+#ifdef DPCT_USM_LEVEL_NONE
+            {
+                auto &mm = mem_mgr::instance();
+                auto to_alloc = mm.translate_ptr(to_surface);
+                auto from_alloc = mm.translate_ptr(from_surface);
+                size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
+                size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
+                event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                              {
+    cgh.depends_on(dep_events);
+    auto to_o = sycl::id<1>(to_offset);
+    auto from_o = sycl::id<1>(from_offset);
+    sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                        sycl::access::target::device>
+        to_acc(to_alloc.buffer, cgh,
+                get_copy_range(size, to_slice, to_range.get(0)), to_o);
+    sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                        sycl::access::target::device>
+        from_acc(from_alloc.buffer, cgh,
+                get_copy_range(size, from_slice, from_range.get(0)), from_o);
+    cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
+        size,
+        [=](sycl::id<3> id) {
+            to_acc[get_offset(id, to_slice, to_range.get(0))] =
+                from_acc[get_offset(id, from_slice, from_range.get(0))];
+        }); }));
+            }
+#else
+                event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                              {
+    cgh.depends_on(dep_events);
+    cgh.parallel_for<class dpct_memcpy_3d_detail>(
+        size,
+        [=](sycl::id<3> id) {
+            to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                from_surface[get_offset(id, from_slice, from_range.get(0))];
+        }); }));
+#endif
+            break;
+            default:
+                throw std::runtime_error("dpct_memcpy: invalid direction value");
+            }
+            return event_list;
+        }
+
+        /// memcpy 2D/3D matrix specified by pitched_data.
+        static inline std::vector<sycl::event>
+        dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+                    pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+                    memcpy_direction direction = automatic)
+        {
+            return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
+                               sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                               sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                               size, direction);
+        }
+
+        /// memcpy 2D matrix with pitch.
+        static inline std::vector<sycl::event>
+        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                    size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+                    memcpy_direction direction = automatic)
+        {
+            return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
+                               sycl::range<3>(from_pitch, y, 1),
+                               sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                               sycl::range<3>(x, y, 1), direction);
+        }
+
+        namespace deprecated
+        {
+
+            template <typename T, sycl::usm::alloc AllocKind>
+            class usm_allocator
+            {
+            private:
+                using Alloc = sycl::usm_allocator<T, AllocKind>;
+                Alloc _impl;
+
+            public:
+                using value_type = typename std::allocator_traits<Alloc>::value_type;
+                using pointer = typename std::allocator_traits<Alloc>::pointer;
+                using const_pointer = typename std::allocator_traits<Alloc>::const_pointer;
+                using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
+                using const_void_pointer =
+                    typename std::allocator_traits<Alloc>::const_void_pointer;
+                using reference = typename std::allocator_traits<Alloc>::value_type &;
+                using const_reference =
+                    const typename std::allocator_traits<Alloc>::value_type &;
+                using difference_type =
+                    typename std::allocator_traits<Alloc>::difference_type;
+                using size_type = typename std::allocator_traits<Alloc>::size_type;
+                using propagate_on_container_copy_assignment = typename std::allocator_traits<
+                    Alloc>::propagate_on_container_copy_assignment;
+                using propagate_on_container_move_assignment = typename std::allocator_traits<
+                    Alloc>::propagate_on_container_move_assignment;
+                using propagate_on_container_swap =
+                    typename std::allocator_traits<Alloc>::propagate_on_container_swap;
+                using is_always_equal =
+                    typename std::allocator_traits<Alloc>::is_always_equal;
+
+                template <typename U>
+                struct rebind
+                {
+                    typedef usm_allocator<U, AllocKind> other;
+                };
+
+                usm_allocator() : _impl(dpct::get_default_queue()) {}
+                ~usm_allocator() {}
+                usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
+                usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
+                pointer address(reference r) { return &r; }
+                const_pointer address(const_reference r) { return &r; }
+                pointer allocate(size_type cnt, const_void_pointer hint = nullptr)
+                {
+                    return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint);
+                }
+                void deallocate(pointer p, size_type cnt)
+                {
+                    std::allocator_traits<Alloc>::deallocate(_impl, p, cnt);
+                }
+                size_type max_size() const
+                {
+                    return std::allocator_traits<Alloc>::max_size(_impl);
+                }
+                bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
+                bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
+            };
+
+        } // namespace deprecated
+
+        inline void dpct_free(void *ptr,
+                              const sycl::queue &q)
+        {
+            if (ptr)
+            {
+#ifdef DPCT_USM_LEVEL_NONE
+                detail::mem_mgr::instance().mem_free(ptr);
+#else
+                sycl::free(ptr, q.get_context());
+#endif // DPCT_USM_LEVEL_NONE
+            }
+        }
+
+        template <typename T>
+        inline auto get_memory(const void *x)
+        {
+            T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
+#ifdef DPCT_USM_LEVEL_NONE
+            return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
+#else
+            return new_x;
+#endif
+        }
+
+        template <typename T>
+        inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q)
+        {
+            using Ty = typename DataType<T>::T2;
+            Ty s_h;
+            if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
+                detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
+                    .wait();
+            else
+                s_h = *reinterpret_cast<const Ty *>(s);
+            return s_h;
+        }
+
+    } // namespace detail
+
+    template <typename T>
+    inline auto get_value(const T *s, sycl::queue &q)
+    {
+        return detail::get_value(s, q);
+    }
+
+    namespace detail
+    {
+        template <class Ta, class Tb, class Tc, class Ts>
+        inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                              oneapi::mkl::transpose b_trans, int m, int n, int k,
+                              const void *alpha, const void *a, int lda, const void *b,
+                              int ldb, const void *beta, void *c, int ldc)
+        {
+#ifndef __INTEL_MKL__
+            throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
+                                     "Project does not support this API.");
+#else
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+            auto data_a = get_memory<const Ta>(a);
+            auto data_b = get_memory<const Tb>(b);
+            auto data_c = get_memory<Tc>(c);
+            oneapi::mkl::blas::column_major::gemm(
+                q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
+                data_b, ldb, beta_value, data_c, ldc);
+#endif
+        }
+
+        template <typename VecT, class BinaryOperation, class = void>
+        class vectorized_binary
+        {
+        public:
+            inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
+            {
+                VecT v4;
+                for (size_t i = 0; i < v4.size(); ++i)
+                {
+                    v4[i] = binary_op(a[i], b[i]);
+                }
+                return v4;
+            }
+        };
+
+        template <typename VecT, class BinaryOperation>
+        class vectorized_binary<
+            VecT, BinaryOperation,
+            std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>>
+        {
+        public:
+            inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
+            {
+                return binary_op(a, b).template as<VecT>();
+            }
+        };
+
+        template <class Ta, class Tb, class Tc, class Ts>
+        inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                                    oneapi::mkl::transpose b_trans, int m, int n, int k,
+                                    const void *alpha, const void **a, int lda,
+                                    const void **b, int ldb, const void *beta, void **c,
+                                    int ldc, int batch_size)
+        {
+            struct matrix_info_t
+            {
+                oneapi::mkl::transpose transpose_info[2];
+                Ts value_info[2];
+                std::int64_t size_info[3];
+                std::int64_t ld_info[3];
+                std::int64_t groupsize_info;
+            };
+
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+
+            matrix_info_t *matrix_info =
+                (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
+            matrix_info->transpose_info[0] = a_trans;
+            matrix_info->transpose_info[1] = b_trans;
+            matrix_info->value_info[0] = alpha_value;
+            matrix_info->value_info[1] = beta_value;
+            matrix_info->size_info[0] = m;
+            matrix_info->size_info[1] = n;
+            matrix_info->size_info[2] = k;
+            matrix_info->ld_info[0] = lda;
+            matrix_info->ld_info[1] = ldb;
+            matrix_info->ld_info[2] = ldc;
+            matrix_info->groupsize_info = batch_size;
+
+            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
+                q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
+                matrix_info->size_info, matrix_info->size_info + 1,
+                matrix_info->size_info + 2, matrix_info->value_info,
+                reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
+                reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
+                matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
+                matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
+
+            q.submit([&](sycl::handler &cgh)
+                     {
+    cgh.depends_on(e);
+    cgh.host_task([=] { std::free(matrix_info); }); });
+        }
+
+        template <class Ta, class Tb, class Tc, class Ts>
+        inline void
+        gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                        oneapi::mkl::transpose b_trans, int m, int n,
+                        int k, const void *alpha, const void *a, int lda,
+                        long long int stride_a, const void *b, int ldb,
+                        long long int stride_b, const void *beta, void *c,
+                        int ldc, long long int stride_c, int batch_size)
+        {
+            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
+            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
+            auto data_a = get_memory<const Ta>(a);
+            auto data_b = get_memory<const Tb>(b);
+            auto data_c = get_memory<Tc>(c);
+            oneapi::mkl::blas::column_major::gemm_batch(
+                q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
+                stride_a, data_b, ldb, stride_b, beta_value,
+                data_c, ldc, stride_c, batch_size);
+        }
+
+    } // namespace detail
+
+    template <typename VecT, class BinaryOperation>
+    inline unsigned vectorized_binary(unsigned a, unsigned b,
+                                      const BinaryOperation binary_op)
+    {
+        sycl::vec<unsigned, 1> v0{a}, v1{b};
+        auto v2 = v0.as<VecT>();
+        auto v3 = v1.as<VecT>();
+        auto v4 =
+            detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op);
+        v0 = v4.template as<sycl::vec<unsigned, 1>>();
+        return v0;
+    }
+
+    static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
+                                  memcpy_direction direction = automatic,
+                                  sycl::queue &q = dpct::get_default_queue())
+    {
+        detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
+    }
+
+    static inline unsigned int select_device(unsigned int id)
+    {
+        dev_mgr::instance().select_device(id);
+        return id;
+    }
+
+    template <typename T>
+    T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
+                               int logical_sub_group_size = 32)
+    {
+        unsigned int id = g.get_local_linear_id();
+        unsigned int start_index =
+            id / logical_sub_group_size * logical_sub_group_size;
+        unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
+        return sycl::select_from_group(g, x,
+                                       target_offset < logical_sub_group_size
+                                           ? start_index + target_offset
+                                           : id);
+    }
+
+    template <typename T>
+    sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
+    {
+        return sycl::vec<T, 1>(val)
+            .template as<sycl::vec<
+                std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
+            .template convert<T>();
+    }
+
+    template <typename T1, typename T2>
+    using dot_product_acc_t =
+        std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
+                           uint32_t, int32_t>;
+
+    template <typename T1, typename T2, typename T3>
+    inline auto dp4a(T1 a, T2 b, T3 c)
+    {
+        dot_product_acc_t<T1, T2> res = c;
+        auto va = extract_and_sign_or_zero_extend4(a);
+        auto vb = extract_and_sign_or_zero_extend4(b);
+        res += va[0] * vb[0];
+        res += va[1] * vb[1];
+        res += va[2] * vb[2];
+        res += va[3] * vb[3];
+        return res;
+    }
+
+    struct sub_sat
+    {
+        template <typename T>
+        auto operator()(const T x, const T y) const
+        {
+            return sycl::sub_sat(x, y);
+        }
+    };
+
+    template <typename S, typename T>
+    inline T vectorized_min(T a, T b)
+    {
+        sycl::vec<T, 1> v0{a}, v1{b};
+        auto v2 = v0.template as<S>();
+        auto v3 = v1.template as<S>();
+        auto v4 = sycl::min(v2, v3);
+        v0 = v4.template as<sycl::vec<T, 1>>();
+        return v0;
+    }
+
+    inline float pow(const float a, const int b) { return sycl::pown(a, b); }
+    inline double pow(const double a, const int b) { return sycl::pown(a, b); }
+    inline float pow(const float a, const float b) { return sycl::pow(a, b); }
+    inline double pow(const double a, const double b) { return sycl::pow(a, b); }
+    template <typename T, typename U>
+    inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
+    pow(const T a, const U b)
+    {
+        return sycl::pow(a, static_cast<T>(b));
+    }
+    template <typename T, typename U>
+    inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
+    pow(const T a, const U b)
+    {
+        return sycl::pow(static_cast<double>(a), static_cast<double>(b));
+    }
+
+    inline double min(const double a, const float b)
+    {
+        return sycl::fmin(a, static_cast<double>(b));
+    }
+    inline double min(const float a, const double b)
+    {
+        return sycl::fmin(static_cast<double>(a), b);
+    }
+    inline float min(const float a, const float b) { return sycl::fmin(a, b); }
+    inline double min(const double a, const double b) { return sycl::fmin(a, b); }
+    inline std::uint32_t min(const std::uint32_t a, const std::int32_t b)
+    {
+        return sycl::min(a, static_cast<std::uint32_t>(b));
+    }
+    inline std::uint32_t min(const std::int32_t a, const std::uint32_t b)
+    {
+        return sycl::min(static_cast<std::uint32_t>(a), b);
+    }
+    inline std::int32_t min(const std::int32_t a, const std::int32_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::int64_t b)
+    {
+        return sycl::min(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t min(const std::int64_t a, const std::uint64_t b)
+    {
+        return sycl::min(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::int64_t min(const std::int64_t a, const std::int64_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b)
+    {
+        return sycl::min(a, b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::int32_t b)
+    {
+        return sycl::min(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t min(const std::int32_t a, const std::uint64_t b)
+    {
+        return sycl::min(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b)
+    {
+        return sycl::min(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b)
+    {
+        return sycl::min(static_cast<std::uint64_t>(a), b);
+    }
+    // max function overloads.
+    // For floating-point types, `float` or `double` arguments are acceptable.
+    // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
+    // `std::int64_t` type arguments are acceptable.
+    inline double max(const double a, const float b)
+    {
+        return sycl::fmax(a, static_cast<double>(b));
+    }
+    inline double max(const float a, const double b)
+    {
+        return sycl::fmax(static_cast<double>(a), b);
+    }
+    inline float max(const float a, const float b) { return sycl::fmax(a, b); }
+    inline double max(const double a, const double b) { return sycl::fmax(a, b); }
+    inline std::uint32_t max(const std::uint32_t a, const std::int32_t b)
+    {
+        return sycl::max(a, static_cast<std::uint32_t>(b));
+    }
+    inline std::uint32_t max(const std::int32_t a, const std::uint32_t b)
+    {
+        return sycl::max(static_cast<std::uint32_t>(a), b);
+    }
+    inline std::int32_t max(const std::int32_t a, const std::int32_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::int64_t b)
+    {
+        return sycl::max(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t max(const std::int64_t a, const std::uint64_t b)
+    {
+        return sycl::max(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::int64_t max(const std::int64_t a, const std::int64_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b)
+    {
+        return sycl::max(a, b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::int32_t b)
+    {
+        return sycl::max(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t max(const std::int32_t a, const std::uint64_t b)
+    {
+        return sycl::max(static_cast<std::uint64_t>(a), b);
+    }
+    inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b)
+    {
+        return sycl::max(a, static_cast<std::uint64_t>(b));
+    }
+    inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b)
+    {
+        return sycl::max(static_cast<std::uint64_t>(a), b);
+    }
+
+    inline void
+    has_capability_or_fail(const sycl::device &dev,
+                           const std::initializer_list<sycl::aspect> &props)
+    {
+        for (const auto &it : props)
+        {
+            if (dev.has(it))
+                continue;
+            switch (it)
+            {
+            case sycl::aspect::fp64:
+                throw std::runtime_error("'double' is not supported in '" +
+                                         dev.get_info<sycl::info::device::name>() +
+                                         "' device");
+                break;
+            case sycl::aspect::fp16:
+                throw std::runtime_error("'half' is not supported in '" +
+                                         dev.get_info<sycl::info::device::name>() +
+                                         "' device");
+                break;
+            default:
+#define __SYCL_ASPECT(ASPECT, ID) \
+    case sycl::aspect::ASPECT:    \
+        return #ASPECT;
+#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
+#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
+                auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string
+                {
+                    switch (AspectNum)
+                    {
+#include <sycl/info/aspects.def>
+#include <sycl/info/aspects_deprecated.def>
+                    default:
+                        return "unknown aspect";
+                    }
+                };
+#undef __SYCL_ASPECT_DEPRECATED_ALIAS
+#undef __SYCL_ASPECT_DEPRECATED
+#undef __SYCL_ASPECT
+                throw std::runtime_error(
+                    "'" + getAspectNameStr(it) + "' is not supported in '" +
+                    dev.get_info<sycl::info::device::name>() + "' device");
+            }
+            break;
+        }
+    }
+
+    static inline unsigned int get_current_device_id()
+    {
+        return dev_mgr::instance().current_device_id();
+    }
+
+    static inline device_ext &get_current_device()
+    {
+        return dev_mgr::instance().current_device();
+    }
+
+    static inline sycl::queue &get_in_order_queue()
+    {
+        return dev_mgr::instance().current_device().in_order_queue();
+    }
+
+    static sycl::event
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
+                memcpy_direction direction,
+                const std::vector<sycl::event> &dep_events = {})
+    {
+        if (!size)
+            return sycl::event{};
+#ifdef DPCT_USM_LEVEL_NONE
+        auto &mm = mem_mgr::instance();
+        auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+
+        switch (real_direction)
+        {
+        case host_to_host:
+            return q.submit([&](sycl::handler &cgh)
+                            {
+        cgh.depends_on(dep_events);
+        cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
+        case host_to_device:
+        {
+            auto alloc = mm.translate_ptr(to_ptr);
+            size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
+            return q.submit([&](sycl::handler &cgh)
+                            {
+        cgh.depends_on(dep_events);
+        auto r = sycl::range<1>(size);
+        auto o = sycl::id<1>(offset);
+        sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                            sycl::access::target::device>
+            acc(alloc.buffer, cgh, r, o);
+        cgh.copy(from_ptr, acc); });
+        }
+        case device_to_host:
+        {
+            auto alloc = mm.translate_ptr(from_ptr);
+            size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
+            return q.submit([&](sycl::handler &cgh)
+                            {
+        cgh.depends_on(dep_events);
+        auto r = sycl::range<1>(size);
+        auto o = sycl::id<1>(offset);
+        sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                            sycl::access::target::device>
+            acc(alloc.buffer, cgh, r, o);
+        cgh.copy(acc, to_ptr); });
+        }
+        case device_to_device:
+        {
+            auto to_alloc = mm.translate_ptr(to_ptr);
+            auto from_alloc = mm.translate_ptr(from_ptr);
+            size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
+            size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
+            return q.submit([&](sycl::handler &cgh)
+                            {
+        cgh.depends_on(dep_events);
+        auto r = sycl::range<1>(size);
+        auto to_o = sycl::id<1>(to_offset);
+        auto from_o = sycl::id<1>(from_offset);
+        sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                            sycl::access::target::device>
+            to_acc(to_alloc.buffer, cgh, r, to_o);
+        sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                            sycl::access::target::device>
+            from_acc(from_alloc.buffer, cgh, r, from_o);
+        cgh.copy(from_acc, to_acc); });
+        }
+        default:
+            throw std::runtime_error("dpct_memcpy: invalid direction value");
+        }
+#else
+        return q.memcpy(to_ptr, from_ptr, size, dep_events);
+#endif // DPCT_USM_LEVEL_NONE
+    }
+
+    // Get actual copy range and make sure it will not exceed range.
+    static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
+                                        size_t pitch)
+    {
+        return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
+    }
+
+    static inline size_t get_offset(sycl::id<3> id, size_t slice,
+                                    size_t pitch)
+    {
+        return slice * id.get(2) + pitch * id.get(1) + id.get(0);
+    }
+
+    /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
+    /// and \p from_range to another specified by \p to_ptr and \p to_range.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                sycl::range<3> to_range, sycl::range<3> from_range,
+                sycl::id<3> to_id, sycl::id<3> from_id,
+                sycl::range<3> size, memcpy_direction direction,
+                const std::vector<sycl::event> &dep_events = {})
+    {
+        // RAII for host pointer
+        class host_buffer
+        {
+            void *_buf;
+            size_t _size;
+            sycl::queue &_q;
+            const std::vector<sycl::event> &_deps; // free operation depends
+
+        public:
+            host_buffer(size_t size, sycl::queue &q,
+                        const std::vector<sycl::event> &deps)
+                : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
+            void *get_ptr() const { return _buf; }
+            size_t get_size() const { return _size; }
+            ~host_buffer()
+            {
+                if (_buf)
+                {
+                    _q.submit([&](sycl::handler &cgh)
+                              {
+            cgh.depends_on(_deps);
+            cgh.host_task([buf = _buf] { std::free(buf); }); });
+                }
+            }
+        };
+        std::vector<sycl::event> event_list;
+
+        size_t to_slice = to_range.get(1) * to_range.get(0),
+               from_slice = from_range.get(1) * from_range.get(0);
+        unsigned char *to_surface =
+            (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
+        const unsigned char *from_surface =
+            (const unsigned char *)from_ptr +
+            get_offset(from_id, from_slice, from_range.get(0));
+
+        if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
+        {
+            return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
+                                direction, dep_events)};
+        }
+        direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
+        size_t size_slice = size.get(1) * size.get(0);
+        switch (direction)
+        {
+        case host_to_host:
+            for (size_t z = 0; z < size.get(2); ++z)
+            {
+                unsigned char *to_ptr = to_surface;
+                const unsigned char *from_ptr = from_surface;
+                if (to_range.get(0) == from_range.get(0) &&
+                    to_range.get(0) == size.get(0))
+                {
+                    event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
+                                                     direction, dep_events));
+                }
+                else
+                {
+                    for (size_t y = 0; y < size.get(1); ++y)
+                    {
+                        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
+                                                         direction, dep_events));
+                        to_ptr += to_range.get(0);
+                        from_ptr += from_range.get(0);
+                    }
+                }
+                to_surface += to_slice;
+                from_surface += from_slice;
+            }
+            break;
+        case host_to_device:
+        {
+            host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
+                            event_list);
+            std::vector<sycl::event> host_events;
+            if (to_slice == size_slice)
+            {
+                // Copy host data to a temp host buffer with the shape of target.
+                host_events =
+                    dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
+                                sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
+                                host_to_host, dep_events);
+            }
+            else
+            {
+                // Copy host data to a temp host buffer with the shape of target.
+                host_events = dpct_memcpy(
+                    q, buf.get_ptr(), from_surface, to_range, from_range,
+                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
+                    // If has padding data, not sure whether it is useless. So fill temp
+                    // buffer with it.
+                    std::vector<sycl::event>{
+                        dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
+                                    device_to_host, dep_events)});
+            }
+            // Copy from temp host buffer to device with only one submit.
+            event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
+                                             buf.get_size(), host_to_device,
+                                             host_events));
+            break;
+        }
+        case device_to_host:
+        {
+            host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
+                            event_list);
+            // Copy from host temp buffer to host target with reshaping.
+            event_list = dpct_memcpy(
+                q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
+                sycl::id<3>(0, 0, 0), size, host_to_host,
+                // Copy from device to temp host buffer with only one submit.
+                std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
+                                                     buf.get_size(),
+                                                     device_to_host, dep_events)});
+            break;
+        }
+        case device_to_device:
+#ifdef DPCT_USM_LEVEL_NONE
+        {
+            auto &mm = mem_mgr::instance();
+            auto to_alloc = mm.translate_ptr(to_surface);
+            auto from_alloc = mm.translate_ptr(from_surface);
+            size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
+            size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
+            event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                          {
+        cgh.depends_on(dep_events);
+        auto to_o = sycl::id<1>(to_offset);
+        auto from_o = sycl::id<1>(from_offset);
+        sycl::accessor<byte_t, 1, sycl::access_mode::write,
+                            sycl::access::target::device>
+            to_acc(to_alloc.buffer, cgh,
+                    get_copy_range(size, to_slice, to_range.get(0)), to_o);
+        sycl::accessor<byte_t, 1, sycl::access_mode::read,
+                            sycl::access::target::device>
+            from_acc(from_alloc.buffer, cgh,
+                    get_copy_range(size, from_slice, from_range.get(0)), from_o);
+        cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
+            size,
+            [=](sycl::id<3> id) {
+                to_acc[get_offset(id, to_slice, to_range.get(0))] =
+                    from_acc[get_offset(id, from_slice, from_range.get(0))];
+            }); }));
+        }
+#else
+            event_list.push_back(q.submit([&](sycl::handler &cgh)
+                                          {
+        cgh.depends_on(dep_events);
+        cgh.parallel_for<class dpct_memcpy_3d_detail>(
+            size,
+            [=](sycl::id<3> id) {
+                to_surface[get_offset(id, to_slice, to_range.get(0))] =
+                    from_surface[get_offset(id, from_slice, from_range.get(0))];
+            }); }));
+#endif
+        break;
+        default:
+            throw std::runtime_error("dpct_memcpy: invalid direction value");
+        }
+        return event_list;
+    }
+
+    /// memcpy 2D/3D matrix specified by pitched_data.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
+                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
+                memcpy_direction direction = automatic)
+    {
+        return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
+                           sycl::range<3>(to.get_pitch(), to.get_y(), 1),
+                           sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
+                           size, direction);
+    }
+
+    /// memcpy 2D matrix with pitch.
+    static inline std::vector<sycl::event>
+    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
+                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
+                memcpy_direction direction = automatic)
+    {
+        return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
+                           sycl::range<3>(from_pitch, y, 1),
+                           sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
+                           sycl::range<3>(x, y, 1), direction);
+    }
+
+    inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                     oneapi::mkl::transpose b_trans, int m, int n, int k,
+                     const void *alpha, const void *a, library_data_t a_type,
+                     int lda, const void *b, library_data_t b_type, int ldb,
+                     const void *beta, void *c, library_data_t c_type, int ldc,
+                     library_data_t scaling_type)
+    {
+        if (scaling_type == library_data_t::real_float &&
+            c_type == library_data_t::complex_float)
+        {
+            scaling_type = library_data_t::complex_float;
+        }
+        else if (scaling_type == library_data_t::real_double &&
+                 c_type == library_data_t::complex_double)
+        {
+            scaling_type = library_data_t::complex_double;
+        }
+
+        std::uint64_t key =
+            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+        switch (key)
+        {
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<float, float, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+            library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_impl<double, double, double, double>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_float, library_data_t::complex_float,
+            library_data_t::complex_float, library_data_t::complex_float):
+        {
+            detail::gemm_impl<std::complex<float>, std::complex<float>,
+                              std::complex<float>, std::complex<float>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_double, library_data_t::complex_double,
+            library_data_t::complex_double, library_data_t::complex_double):
+        {
+            detail::gemm_impl<std::complex<double>, std::complex<double>,
+                              std::complex<double>, std::complex<double>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                              sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
+                                          lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
+                                     ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
+                              sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
+                                          a, lda, b, ldb, &beta_half, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                              oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_int32, library_data_t::real_int32):
+        {
+            float alpha_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+            float beta_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+            detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
+                q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
+            break;
+        }
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
+        }
+    } // gemm()
+
+    /// Computes a batch of matrix-matrix product with general matrices.
+    /// \param [in] q The queue where the routine should be executed.
+    /// \param [in] a_trans Specifies the operation applied to A.
+    /// \param [in] b_trans Specifies the operation applied to B.
+    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+    /// \param [in] alpha Scaling factor for the matrix-matrix product.
+    /// \param [in] a Input matrix A.
+    /// \param [in] a_type Data type of the matrix A.
+    /// \param [in] lda Leading dimension of A.
+    /// \param [in] b Input matrix B.
+    /// \param [in] b_type Data type of the matrix B.
+    /// \param [in] ldb Leading dimension of B.
+    /// \param [in] beta Scaling factor for matrix C.
+    /// \param [in, out] c Input/Output matrix C.
+    /// \param [in] c_type Data type of the matrix C.
+    /// \param [in] ldc Leading dimension of C.
+    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+    /// \param [in] scaling_type Data type of the scaling factors.
+    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                           oneapi::mkl::transpose b_trans, int m, int n, int k,
+                           const void *alpha, const void *a[],
+                           library_data_t a_type, int lda, const void *b[],
+                           library_data_t b_type, int ldb, const void *beta,
+                           void *c[], library_data_t c_type, int ldc,
+                           int batch_size, library_data_t scaling_type)
+    {
+#ifdef DPCT_USM_LEVEL_NONE
+        throw std::runtime_error("this API is unsupported when USM level is none");
+#else
+        if (scaling_type == library_data_t::real_float &&
+            c_type == library_data_t::complex_float)
+        {
+            scaling_type = library_data_t::complex_float;
+        }
+        else if (scaling_type == library_data_t::real_double &&
+                 c_type == library_data_t::complex_double)
+        {
+            scaling_type = library_data_t::complex_double;
+        }
+
+        std::uint64_t key =
+            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+        switch (key)
+        {
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<float, float, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+            library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_batch_impl<double, double, double, double>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_float, library_data_t::complex_float,
+            library_data_t::complex_float, library_data_t::complex_float):
+        {
+            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                                    std::complex<float>, std::complex<float>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_double, library_data_t::complex_double,
+            library_data_t::complex_double, library_data_t::complex_double):
+        {
+            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                                    std::complex<double>, std::complex<double>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                                                a, lda, b, ldb, beta, c, ldc,
+                                                batch_size);
+            break;
+        }
+#ifdef __INTEL_MKL__
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                                    oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                                           b, ldb, beta, c, ldc, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_int32, library_data_t::real_int32):
+        {
+            float alpha_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
+            float beta_float =
+                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                                    float>(q, a_trans, b_trans, m, n, k, &alpha_float,
+                                           a, lda, b, ldb, &beta_float, c, ldc,
+                                           batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
+                batch_size);
+            break;
+        }
+#endif
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
+                batch_size);
+            break;
+        }
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
+        }
+#endif
+    }
+
+    /// Computes a batch of matrix-matrix product with general matrices.
+    /// \param [in] q The queue where the routine should be executed.
+    /// \param [in] a_trans Specifies the operation applied to A.
+    /// \param [in] b_trans Specifies the operation applied to B.
+    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
+    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
+    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
+    /// \param [in] alpha Scaling factor for the matrix-matrix product.
+    /// \param [in] a Input matrix A.
+    /// \param [in] a_type Data type of the matrix A.
+    /// \param [in] lda Leading dimension of A.
+    /// \param [in] stride_a Stride between the different A matrices.
+    /// \param [in] b Input matrix B.
+    /// \param [in] b_type Data type of the matrix B.
+    /// \param [in] ldb Leading dimension of B.
+    /// \param [in] stride_b Stride between the different B matrices.
+    /// \param [in] beta Scaling factor for matrix C.
+    /// \param [in, out] c Input/Output matrix C.
+    /// \param [in] c_type Data type of the matrix C.
+    /// \param [in] ldc Leading dimension of C.
+    /// \param [in] stride_c Stride between the different C matrices.
+    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
+    /// \param [in] scaling_type Data type of the scaling factors.
+    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
+                           oneapi::mkl::transpose b_trans, int m, int n, int k,
+                           const void *alpha, const void *a, library_data_t a_type,
+                           int lda, long long int stride_a, const void *b,
+                           library_data_t b_type, int ldb, long long int stride_b,
+                           const void *beta, void *c, library_data_t c_type,
+                           int ldc, long long int stride_c, int batch_size,
+                           library_data_t scaling_type)
+    {
+        if (scaling_type == library_data_t::real_float &&
+            c_type == library_data_t::complex_float)
+        {
+            scaling_type = library_data_t::complex_float;
+        }
+        else if (scaling_type == library_data_t::real_double &&
+                 c_type == library_data_t::complex_double)
+        {
+            scaling_type = library_data_t::complex_double;
+        }
+
+        std::uint64_t key =
+            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
+        switch (key)
+        {
+        case detail::get_type_combination_id(
+            library_data_t::real_float, library_data_t::real_float,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<float, float, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_double, library_data_t::real_double,
+            library_data_t::real_double, library_data_t::real_double):
+        {
+            detail::gemm_batch_impl<double, double, double, double>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_float, library_data_t::complex_float,
+            library_data_t::complex_float, library_data_t::complex_float):
+        {
+            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
+                                    std::complex<float>, std::complex<float>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::complex_double, library_data_t::complex_double,
+            library_data_t::complex_double, library_data_t::complex_double):
+        {
+            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
+                                    std::complex<double>, std::complex<double>>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_half):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
+                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
+                                                a, lda, stride_a, b, ldb, stride_b,
+                                                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+#ifdef __INTEL_MKL__
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_bfloat16, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
+                                    oneapi::mkl::bfloat16, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
+                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
+                                           stride_a, b, ldb, stride_b, beta, c, ldc,
+                                           stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_int32, library_data_t::real_int32):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
+                                    std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
+                                                  a, lda, stride_a, b, ldb, stride_b,
+                                                  beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_int8, library_data_t::real_int8,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_float, library_data_t::real_float):
+        {
+            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
+                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
+                beta, c, ldc, stride_c, batch_size);
+            break;
+        }
+#endif
+        case detail::get_type_combination_id(
+            library_data_t::real_half, library_data_t::real_half,
+            library_data_t::real_half, library_data_t::real_float):
+        {
+            float alpha_value =
+                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
+            float beta_value =
+                dpct::get_value(reinterpret_cast<const float *>(beta), q);
+            sycl::half alpha_half(alpha_value);
+            sycl::half beta_half(beta_value);
+            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
+                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
+                &beta_half, c, ldc, stride_c, batch_size);
+            break;
+        }
+        default:
+            throw std::runtime_error("the combination of data type is unsupported");
+        }
+    }
+
+    static inline void
+    async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
+                      size_t from_pitch, size_t x, size_t y,
+                      memcpy_direction direction = automatic,
+                      sycl::queue &q = get_default_queue())
+    {
+        detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
+                            direction);
+    }
+
+    using err0 = detail::generic_error_type<struct err0_tag, int>;
+    using err1 = detail::generic_error_type<struct err1_tag, int>;
+
+} // COPY from DPCT head files
+
+
 static int g_ggml_sycl_debug=0;
 #define GGML_SYCL_DEBUG(...) do{if(g_ggml_sycl_debug) printf(__VA_ARGS__);}while(0)
 
@@ -8377,7 +11178,7 @@ static bool g_sycl_loaded = false;
 bool ggml_sycl_loaded(void) {
     return g_sycl_loaded;
 }
-void print_devices(){
+void print_sycl_devices(){
     int device_count = dpct::dev_mgr::instance().device_count();
     fprintf(stderr, "found %d SYCL devices:\n", device_count);
     for (int id = 0; id < device_count; ++id) {
@@ -8422,13 +11223,6 @@ void ggml_init_sycl() try {
     static bool initialized = false;
 
     if (!initialized) {
-        if (get_sycl_env("GGML_SYCL_LIST_DEVICE", 0)!=0){
-            printf("SYCL devices:\n");
-            print_devices();
-            printf("Exit for list devices task. unset GGML_SYCL_LIST_DEVICE to restore LLM work!\n");
-            std::exit(0);
-        }
-
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
 
         printf("GGML_SYCL_DEBUG=%d\n", g_ggml_sycl_debug);
@@ -8457,7 +11251,7 @@ void ggml_init_sycl() try {
 #else
         fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
-        print_devices();
+        print_sycl_devices();
         for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
             g_sycl_device_id2index[id].index = -1;
             g_device_caps[id].vmm = 0;
diff --git a/ggml-sycl.h b/ggml-sycl.h
index 1ad1b3737d95a..02bfa72072124 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -15,2810 +15,8 @@ GGML_API bool   ggml_sycl_compute_forward(struct ggml_compute_params * params, s
 GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
+GGML_API void   print_sycl_devices();
 
 #ifdef  __cplusplus
 }
-#endif
-
-/*
-Following definition copied from DPCT head files, which are used by ggml-sycl.cpp
-*/
-#ifdef __cplusplus
-
-#include <sycl/sycl.hpp>
-#include <oneapi/mkl.hpp>
-#include <map>
-
-#if defined(__linux__)
-#include <sys/mman.h>
-#elif defined(_WIN64)
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#else
-#error "Only support Windows and Linux."
-#endif
-
-#if defined(__linux__)
-#include <unistd.h>
-#include <sys/syscall.h>
-#endif
-#if defined(_WIN64)
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-#include <windows.h>
-#endif
-
-#define DPCT_COMPATIBILITY_TEMP (900)
-
-#if defined(_MSC_VER)
-#define __dpct_align__(n) __declspec(align(n))
-#define __dpct_inline__ __forceinline
-#else
-#define __dpct_align__(n) __attribute__((aligned(n)))
-#define __dpct_inline__ __inline__ __attribute__((always_inline))
-#endif
-
-#if defined(_MSC_VER)
-#define __dpct_noinline__ __declspec(noinline)
-#else
-#define __dpct_noinline__ __attribute__((noinline))
-#endif
-
-namespace dpct
-{
-    typedef sycl::queue *queue_ptr;
-    typedef sycl::event *event_ptr;
-    typedef char *device_ptr;
-    typedef uint8_t byte_t;
-    typedef sycl::buffer<byte_t> buffer_t;
-
-    /// SYCL default exception handler
-    inline auto exception_handler = [](sycl::exception_list exceptions)
-    {
-        for (std::exception_ptr const &e : exceptions)
-        {
-            try
-            {
-                std::rethrow_exception(e);
-            }
-            catch (sycl::exception const &e)
-            {
-                std::cerr << "Caught asynchronous SYCL exception:" << std::endl
-                          << e.what() << std::endl
-                          << "Exception caught at file:" << __FILE__
-                          << ", line:" << __LINE__ << std::endl;
-            }
-        }
-    };
-
-    enum error_code
-    {
-        success = 0,
-        default_error = 999
-    };
-
-    enum memcpy_direction
-    {
-        host_to_host,
-        host_to_device,
-        device_to_host,
-        device_to_device,
-        automatic
-    };
-
-    enum memory_region
-    {
-        global = 0, // device global memory
-        constant,   // device constant memory
-        local,      // device local memory
-        shared,     // memory which can be accessed by host and device
-    };
-
-    enum class library_data_t : unsigned char
-    {
-        real_float = 0,
-        complex_float,
-        real_double,
-        complex_double,
-        real_half,
-        complex_half,
-        real_bfloat16,
-        complex_bfloat16,
-        real_int4,
-        complex_int4,
-        real_uint4,
-        complex_uint4,
-        real_int8,
-        complex_int8,
-        real_uint8,
-        complex_uint8,
-        real_int16,
-        complex_int16,
-        real_uint16,
-        complex_uint16,
-        real_int32,
-        complex_int32,
-        real_uint32,
-        complex_uint32,
-        real_int64,
-        complex_int64,
-        real_uint64,
-        complex_uint64,
-        real_int8_4,
-        real_int8_32,
-        real_uint8_4,
-        library_data_t_size
-    };
-
-    template <typename T>
-    struct DataType
-    {
-        using T2 = T;
-    };
-    template <typename T>
-    struct DataType<sycl::vec<T, 2>>
-    {
-        using T2 = std::complex<T>;
-    };
-
-    static void destroy_event(event_ptr event)
-    {
-        delete event;
-    }
-
-    static inline unsigned int get_tid()
-    {
-#if defined(__linux__)
-        return syscall(SYS_gettid);
-#elif defined(_WIN64)
-        return GetCurrentThreadId();
-#else
-#error "Only support Windows and Linux."
-#endif
-    }
-
-    namespace detail
-    {
-        static void get_version(const sycl::device &dev, int &major, int &minor)
-        {
-            // Version string has the following format:
-            // a. OpenCL<space><major.minor><space><vendor-specific-information>
-            // b. <major.minor>
-            std::string ver;
-            ver = dev.get_info<sycl::info::device::version>();
-            std::string::size_type i = 0;
-            while (i < ver.size())
-            {
-                if (isdigit(ver[i]))
-                    break;
-                i++;
-            }
-            major = std::stoi(&(ver[i]));
-            while (i < ver.size())
-            {
-                if (ver[i] == '.')
-                    break;
-                i++;
-            }
-            i++;
-            minor = std::stoi(&(ver[i]));
-        }
-
-        template <typename tag, typename T>
-        class generic_error_type
-        {
-        public:
-            generic_error_type() = default;
-            generic_error_type(T value) : value{value} {}
-            operator T() const { return value; }
-
-        private:
-            T value;
-        };
-
-    } // namespace detail
-
-    /// Pitched 2D/3D memory data.
-    class pitched_data
-    {
-    public:
-        pitched_data() : pitched_data(nullptr, 0, 0, 0) {}
-        pitched_data(void *data, size_t pitch, size_t x, size_t y)
-            : _data(data), _pitch(pitch), _x(x), _y(y) {}
-
-        void *get_data_ptr() { return _data; }
-        void set_data_ptr(void *data) { _data = data; }
-
-        size_t get_pitch() { return _pitch; }
-        void set_pitch(size_t pitch) { _pitch = pitch; }
-
-        size_t get_x() { return _x; }
-        void set_x(size_t x) { _x = x; };
-
-        size_t get_y() { return _y; }
-        void set_y(size_t y) { _y = y; }
-
-    private:
-        void *_data;
-        size_t _pitch, _x, _y;
-    };
-
-    class device_info
-    {
-    public:
-        // get interface
-        const char *get_name() const { return _name; }
-        char *get_name() { return _name; }
-        template <typename WorkItemSizesTy = sycl::range<3>,
-                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                       std::is_same_v<WorkItemSizesTy, int *>,
-                                   int> = 0>
-        auto get_max_work_item_sizes() const
-        {
-            if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
-                return sycl::range<3>(_max_work_item_sizes_i[0],
-                                      _max_work_item_sizes_i[1],
-                                      _max_work_item_sizes_i[2]);
-            else
-            {
-                return _max_work_item_sizes_i;
-            }
-        }
-        template <typename WorkItemSizesTy = sycl::range<3>,
-                  std::enable_if_t<std::is_same_v<WorkItemSizesTy, sycl::range<3>> ||
-                                       std::is_same_v<WorkItemSizesTy, int *>,
-                                   int> = 0>
-        auto get_max_work_item_sizes()
-        {
-            if constexpr (std::is_same_v<WorkItemSizesTy, sycl::range<3>>)
-                return sycl::range<3>(_max_work_item_sizes_i[0],
-                                      _max_work_item_sizes_i[1],
-                                      _max_work_item_sizes_i[2]);
-            else
-            {
-                return _max_work_item_sizes_i;
-            }
-        }
-        bool get_host_unified_memory() const { return _host_unified_memory; }
-        int get_major_version() const { return _major; }
-        int get_minor_version() const { return _minor; }
-        int get_integrated() const { return _integrated; }
-        int get_max_clock_frequency() const { return _frequency; }
-        int get_max_compute_units() const { return _max_compute_units; }
-        int get_max_work_group_size() const { return _max_work_group_size; }
-        int get_max_sub_group_size() const { return _max_sub_group_size; }
-        int get_max_work_items_per_compute_unit() const
-        {
-            return _max_work_items_per_compute_unit;
-        }
-        int get_max_register_size_per_work_group() const
-        {
-            return _max_register_size_per_work_group;
-        }
-        template <typename NDRangeSizeTy = size_t *,
-                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                       std::is_same_v<NDRangeSizeTy, int *>,
-                                   int> = 0>
-        auto get_max_nd_range_size() const
-        {
-            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
-                return _max_nd_range_size;
-            else
-                return _max_nd_range_size_i;
-        }
-        template <typename NDRangeSizeTy = size_t *,
-                  std::enable_if_t<std::is_same_v<NDRangeSizeTy, size_t *> ||
-                                       std::is_same_v<NDRangeSizeTy, int *>,
-                                   int> = 0>
-        auto get_max_nd_range_size()
-        {
-            if constexpr (std::is_same_v<NDRangeSizeTy, size_t *>)
-                return _max_nd_range_size;
-            else
-                return _max_nd_range_size_i;
-        }
-        size_t get_global_mem_size() const { return _global_mem_size; }
-        size_t get_local_mem_size() const { return _local_mem_size; }
-        /// Returns the maximum clock rate of device's global memory in kHz. If
-        /// compiler does not support this API then returns default value 3200000 kHz.
-        unsigned int get_memory_clock_rate() const { return _memory_clock_rate; }
-        /// Returns the maximum bus width between device and memory in bits. If
-        /// compiler does not support this API then returns default value 64 bits.
-        unsigned int get_memory_bus_width() const { return _memory_bus_width; }
-        uint32_t get_device_id() const { return _device_id; }
-        std::array<unsigned char, 16> get_uuid() const { return _uuid; }
-        /// Returns global memory cache size in bytes.
-        unsigned int get_global_mem_cache_size() const
-        {
-            return _global_mem_cache_size;
-        }
-
-        // set interface
-        void set_name(const char *name)
-        {
-            size_t length = strlen(name);
-            if (length < 256)
-            {
-                std::memcpy(_name, name, length + 1);
-            }
-            else
-            {
-                std::memcpy(_name, name, 255);
-                _name[255] = '\0';
-            }
-        }
-        void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes)
-        {
-            for (int i = 0; i < 3; ++i)
-                _max_work_item_sizes_i[i] = max_work_item_sizes[i];
-        }
-        [[deprecated]] void
-        set_max_work_item_sizes(const sycl::id<3> max_work_item_sizes)
-        {
-            for (int i = 0; i < 3; ++i)
-            {
-                _max_work_item_sizes_i[i] = max_work_item_sizes[i];
-            }
-        }
-        void set_host_unified_memory(bool host_unified_memory)
-        {
-            _host_unified_memory = host_unified_memory;
-        }
-        void set_major_version(int major) { _major = major; }
-        void set_minor_version(int minor) { _minor = minor; }
-        void set_integrated(int integrated) { _integrated = integrated; }
-        void set_max_clock_frequency(int frequency) { _frequency = frequency; }
-        void set_max_compute_units(int max_compute_units)
-        {
-            _max_compute_units = max_compute_units;
-        }
-        void set_global_mem_size(size_t global_mem_size)
-        {
-            _global_mem_size = global_mem_size;
-        }
-        void set_local_mem_size(size_t local_mem_size)
-        {
-            _local_mem_size = local_mem_size;
-        }
-        void set_max_work_group_size(int max_work_group_size)
-        {
-            _max_work_group_size = max_work_group_size;
-        }
-        void set_max_sub_group_size(int max_sub_group_size)
-        {
-            _max_sub_group_size = max_sub_group_size;
-        }
-        void
-        set_max_work_items_per_compute_unit(int max_work_items_per_compute_unit)
-        {
-            _max_work_items_per_compute_unit = max_work_items_per_compute_unit;
-        }
-        void set_max_nd_range_size(int max_nd_range_size[])
-        {
-            for (int i = 0; i < 3; i++)
-            {
-                _max_nd_range_size[i] = max_nd_range_size[i];
-                _max_nd_range_size_i[i] = max_nd_range_size[i];
-            }
-        }
-        void set_memory_clock_rate(unsigned int memory_clock_rate)
-        {
-            _memory_clock_rate = memory_clock_rate;
-        }
-        void set_memory_bus_width(unsigned int memory_bus_width)
-        {
-            _memory_bus_width = memory_bus_width;
-        }
-        void
-        set_max_register_size_per_work_group(int max_register_size_per_work_group)
-        {
-            _max_register_size_per_work_group = max_register_size_per_work_group;
-        }
-        void set_device_id(uint32_t device_id)
-        {
-            _device_id = device_id;
-        }
-        void set_uuid(std::array<unsigned char, 16> uuid)
-        {
-            _uuid = std::move(uuid);
-        }
-        void set_global_mem_cache_size(unsigned int global_mem_cache_size)
-        {
-            _global_mem_cache_size = global_mem_cache_size;
-        }
-
-    private:
-        char _name[256];
-        int _max_work_item_sizes_i[3];
-        bool _host_unified_memory = false;
-        int _major;
-        int _minor;
-        int _integrated = 0;
-        int _frequency;
-        // Set estimated value 3200000 kHz as default value.
-        unsigned int _memory_clock_rate = 3200000;
-        // Set estimated value 64 bits as default value.
-        unsigned int _memory_bus_width = 64;
-        unsigned int _global_mem_cache_size;
-        int _max_compute_units;
-        int _max_work_group_size;
-        int _max_sub_group_size;
-        int _max_work_items_per_compute_unit;
-        int _max_register_size_per_work_group;
-        size_t _global_mem_size;
-        size_t _local_mem_size;
-        size_t _max_nd_range_size[3];
-        int _max_nd_range_size_i[3];
-        uint32_t _device_id;
-        std::array<unsigned char, 16> _uuid;
-    };
-
-    static int get_major_version(const sycl::device &dev)
-    {
-        int major, minor;
-        detail::get_version(dev, major, minor);
-        return major;
-    }
-
-    static int get_minor_version(const sycl::device &dev)
-    {
-        int major, minor;
-        detail::get_version(dev, major, minor);
-        return minor;
-    }
-
-    static void get_device_info(device_info &out, const sycl::device &dev)
-    {
-        device_info prop;
-        prop.set_name(dev.get_info<sycl::info::device::name>().c_str());
-
-        int major, minor;
-        detail::get_version(dev, major, minor);
-        prop.set_major_version(major);
-        prop.set_minor_version(minor);
-
-        prop.set_max_work_item_sizes(
-#if (__SYCL_COMPILER_VERSION && __SYCL_COMPILER_VERSION < 20220902)
-            // oneAPI DPC++ compiler older than 2022/09/02, where max_work_item_sizes
-            // is an enum class element
-            dev.get_info<sycl::info::device::max_work_item_sizes>());
-#else
-            // SYCL 2020-conformant code, max_work_item_sizes is a struct templated by
-            // an int
-            dev.get_info<sycl::info::device::max_work_item_sizes<3>>());
-#endif
-        prop.set_host_unified_memory(dev.has(sycl::aspect::usm_host_allocations));
-
-        prop.set_max_clock_frequency(
-            dev.get_info<sycl::info::device::max_clock_frequency>() * 1000);
-
-        prop.set_max_compute_units(
-            dev.get_info<sycl::info::device::max_compute_units>());
-        prop.set_max_work_group_size(
-            dev.get_info<sycl::info::device::max_work_group_size>());
-        prop.set_global_mem_size(dev.get_info<sycl::info::device::global_mem_size>());
-        prop.set_local_mem_size(dev.get_info<sycl::info::device::local_mem_size>());
-
-#if (defined(SYCL_EXT_INTEL_DEVICE_INFO) && SYCL_EXT_INTEL_DEVICE_INFO >= 6)
-        if (dev.has(sycl::aspect::ext_intel_memory_clock_rate))
-        {
-            unsigned int tmp =
-                dev.get_info<sycl::ext::intel::info::device::memory_clock_rate>();
-            if (tmp != 0)
-                prop.set_memory_clock_rate(1000 * tmp);
-        }
-        if (dev.has(sycl::aspect::ext_intel_memory_bus_width))
-        {
-            prop.set_memory_bus_width(
-                dev.get_info<sycl::ext::intel::info::device::memory_bus_width>());
-        }
-        if (dev.has(sycl::aspect::ext_intel_device_id))
-        {
-            prop.set_device_id(
-                dev.get_info<sycl::ext::intel::info::device::device_id>());
-        }
-        if (dev.has(sycl::aspect::ext_intel_device_info_uuid))
-        {
-            prop.set_uuid(dev.get_info<sycl::ext::intel::info::device::uuid>());
-        }
-#elif defined(_MSC_VER) && !defined(__clang__)
-#pragma message("get_device_info: querying memory_clock_rate and \
-        memory_bus_width are not supported by the compiler used. \
-        Use 3200000 kHz as memory_clock_rate default value. \
-        Use 64 bits as memory_bus_width default value.")
-#else
-#warning "get_device_info: querying memory_clock_rate and \
-        memory_bus_width are not supported by the compiler used. \
-        Use 3200000 kHz as memory_clock_rate default value. \
-        Use 64 bits as memory_bus_width default value."
-#endif
-
-        size_t max_sub_group_size = 1;
-        std::vector<size_t> sub_group_sizes =
-            dev.get_info<sycl::info::device::sub_group_sizes>();
-
-        for (const auto &sub_group_size : sub_group_sizes)
-        {
-            if (max_sub_group_size < sub_group_size)
-                max_sub_group_size = sub_group_size;
-        }
-
-        prop.set_max_sub_group_size(max_sub_group_size);
-
-        prop.set_max_work_items_per_compute_unit(
-            dev.get_info<sycl::info::device::max_work_group_size>());
-        int max_nd_range_size[] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
-        prop.set_max_nd_range_size(max_nd_range_size);
-
-        // Estimates max register size per work group, feel free to update the value
-        // according to device properties.
-        prop.set_max_register_size_per_work_group(65536);
-
-        prop.set_global_mem_cache_size(
-            dev.get_info<sycl::info::device::global_mem_cache_size>());
-        out = prop;
-    }
-
-    /// dpct device extension
-    class device_ext : public sycl::device
-    {
-        typedef std::mutex mutex_type;
-
-    public:
-        device_ext() : sycl::device(), _ctx(*this) {}
-        ~device_ext()
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            clear_queues();
-        }
-        device_ext(const sycl::device &base) : sycl::device(base), _ctx(*this)
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            init_queues();
-        }
-
-        int is_native_atomic_supported() { return 0; }
-        int get_major_version() const
-        {
-            return dpct::get_major_version(*this);
-        }
-
-        int get_minor_version() const
-        {
-            return dpct::get_minor_version(*this);
-        }
-
-        int get_max_compute_units() const
-        {
-            return get_device_info().get_max_compute_units();
-        }
-
-        /// Return the maximum clock frequency of this device in KHz.
-        int get_max_clock_frequency() const
-        {
-            return get_device_info().get_max_clock_frequency();
-        }
-
-        int get_integrated() const { return get_device_info().get_integrated(); }
-
-        int get_max_sub_group_size() const
-        {
-            return get_device_info().get_max_sub_group_size();
-        }
-
-        int get_max_register_size_per_work_group() const
-        {
-            return get_device_info().get_max_register_size_per_work_group();
-        }
-
-        int get_max_work_group_size() const
-        {
-            return get_device_info().get_max_work_group_size();
-        }
-
-        int get_mem_base_addr_align() const
-        {
-            return get_info<sycl::info::device::mem_base_addr_align>();
-        }
-
-        size_t get_global_mem_size() const
-        {
-            return get_device_info().get_global_mem_size();
-        }
-
-        /// Get the number of bytes of free and total memory on the SYCL device.
-        /// \param [out] free_memory The number of bytes of free memory on the SYCL device.
-        /// \param [out] total_memory The number of bytes of total memory on the SYCL device.
-        void get_memory_info(size_t &free_memory, size_t &total_memory)
-        {
-#if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105)
-            if (!has(sycl::aspect::ext_intel_free_memory))
-            {
-                std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
-                free_memory = 0;
-            }
-            else
-            {
-                free_memory = get_info<sycl::ext::intel::info::device::free_memory>();
-            }
-#else
-            std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl;
-            free_memory = 0;
-#if defined(_MSC_VER) && !defined(__clang__)
-#pragma message("Querying the number of bytes of free memory is not supported")
-#else
-#warning "Querying the number of bytes of free memory is not supported"
-#endif
-#endif
-            total_memory = get_device_info().get_global_mem_size();
-        }
-
-        void get_device_info(device_info &out) const
-        {
-            dpct::get_device_info(out, *this);
-        }
-
-        device_info get_device_info() const
-        {
-            device_info prop;
-            dpct::get_device_info(prop, *this);
-            return prop;
-        }
-
-        void reset()
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            clear_queues();
-            init_queues();
-        }
-
-        sycl::queue &in_order_queue() { return *_q_in_order; }
-
-        sycl::queue &out_of_order_queue() { return *_q_out_of_order; }
-
-        sycl::queue &default_queue()
-        {
-#ifdef DPCT_USM_LEVEL_NONE
-            return out_of_order_queue();
-#else
-            return in_order_queue();
-#endif // DPCT_USM_LEVEL_NONE
-        }
-
-        void queues_wait_and_throw()
-        {
-            std::unique_lock<mutex_type> lock(m_mutex);
-            std::vector<std::shared_ptr<sycl::queue>> current_queues(
-                _queues);
-            lock.unlock();
-            for (const auto &q : current_queues)
-            {
-                q->wait_and_throw();
-            }
-            // Guard the destruct of current_queues to make sure the ref count is safe.
-            lock.lock();
-        }
-
-        sycl::queue *create_queue(bool enable_exception_handler = false)
-        {
-#ifdef DPCT_USM_LEVEL_NONE
-            return create_out_of_order_queue(enable_exception_handler);
-#else
-            return create_in_order_queue(enable_exception_handler);
-#endif // DPCT_USM_LEVEL_NONE
-        }
-
-        sycl::queue *create_in_order_queue(bool enable_exception_handler = false)
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            return create_queue_impl(enable_exception_handler,
-                                     sycl::property::queue::in_order());
-        }
-
-        sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false)
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            return create_queue_impl(enable_exception_handler);
-        }
-
-        void destroy_queue(sycl::queue *&queue)
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            _queues.erase(std::remove_if(_queues.begin(), _queues.end(),
-                                         [=](const std::shared_ptr<sycl::queue> &q) -> bool
-                                         {
-                                             return q.get() == queue;
-                                         }),
-                          _queues.end());
-            queue = nullptr;
-        }
-        void set_saved_queue(sycl::queue *q)
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            _saved_queue = q;
-        }
-        sycl::queue *get_saved_queue() const
-        {
-            std::lock_guard<mutex_type> lock(m_mutex);
-            return _saved_queue;
-        }
-        sycl::context get_context() const { return _ctx; }
-
-    private:
-        void clear_queues()
-        {
-            _queues.clear();
-            _q_in_order = _q_out_of_order = _saved_queue = nullptr;
-        }
-
-        void init_queues()
-        {
-            _q_in_order = create_queue_impl(true, sycl::property::queue::in_order());
-            _q_out_of_order = create_queue_impl(true);
-            _saved_queue = &default_queue();
-        }
-
-        /// Caller should acquire resource \p m_mutex before calling this function.
-        template <class... Properties>
-        sycl::queue *create_queue_impl(bool enable_exception_handler,
-                                       Properties... properties)
-        {
-            sycl::async_handler eh = {};
-            if (enable_exception_handler)
-            {
-                eh = exception_handler;
-            }
-            _queues.push_back(std::make_shared<sycl::queue>(
-                _ctx, *this, eh,
-                sycl::property_list(
-#ifdef DPCT_PROFILING_ENABLED
-                    sycl::property::queue::enable_profiling(),
-#endif
-                    properties...)));
-
-            return _queues.back().get();
-        }
-
-        void get_version(int &major, int &minor) const
-        {
-            detail::get_version(*this, major, minor);
-        }
-        sycl::queue *_q_in_order, *_q_out_of_order;
-        sycl::queue *_saved_queue;
-        sycl::context _ctx;
-        std::vector<std::shared_ptr<sycl::queue>> _queues;
-        mutable mutex_type m_mutex;
-    };
-
-    /// device manager
-    class dev_mgr
-    {
-    public:
-        device_ext &current_device()
-        {
-            unsigned int dev_id = current_device_id();
-            check_id(dev_id);
-            return *_devs[dev_id];
-        }
-        device_ext &cpu_device() const
-        {
-            std::lock_guard<std::recursive_mutex> lock(m_mutex);
-            if (_cpu_device == -1)
-            {
-                throw std::runtime_error("no valid cpu device");
-            }
-            else
-            {
-                return *_devs[_cpu_device];
-            }
-        }
-        device_ext &get_device(unsigned int id) const
-        {
-            std::lock_guard<std::recursive_mutex> lock(m_mutex);
-            check_id(id);
-            return *_devs[id];
-        }
-        unsigned int current_device_id() const
-        {
-            std::lock_guard<std::recursive_mutex> lock(m_mutex);
-            auto it = _thread2dev_map.find(get_tid());
-            if (it != _thread2dev_map.end())
-                return it->second;
-            return DEFAULT_DEVICE_ID;
-        }
-
-        /// Select device with a device ID.
-        /// \param [in] id The id of the device which can
-        /// be obtained through get_device_id(const sycl::device).
-        void select_device(unsigned int id)
-        {
-            std::lock_guard<std::recursive_mutex> lock(m_mutex);
-            check_id(id);
-            _thread2dev_map[get_tid()] = id;
-        }
-        unsigned int device_count() { return _devs.size(); }
-
-        unsigned int get_device_id(const sycl::device &dev)
-        {
-            unsigned int id = 0;
-            for (auto dev_item : _devs)
-            {
-                if (*dev_item == dev)
-                {
-                    break;
-                }
-                id++;
-            }
-            return id;
-        }
-
-        template <class DeviceSelector>
-        std::enable_if_t<
-            std::is_invocable_r_v<int, DeviceSelector, const sycl::device &>>
-        select_device(const DeviceSelector &selector = sycl::gpu_selector_v)
-        {
-            sycl::device selected_device = sycl::device(selector);
-            unsigned int selected_device_id = get_device_id(selected_device);
-            select_device(selected_device_id);
-        }
-
-        /// Returns the instance of device manager singleton.
-        static dev_mgr &instance()
-        {
-            static dev_mgr d_m;
-            return d_m;
-        }
-        dev_mgr(const dev_mgr &) = delete;
-        dev_mgr &operator=(const dev_mgr &) = delete;
-        dev_mgr(dev_mgr &&) = delete;
-        dev_mgr &operator=(dev_mgr &&) = delete;
-
-    private:
-        mutable std::recursive_mutex m_mutex;
-        dev_mgr()
-        {
-            sycl::device default_device =
-                sycl::device(sycl::default_selector_v);
-            _devs.push_back(std::make_shared<device_ext>(default_device));
-
-            std::vector<sycl::device> sycl_all_devs =
-                sycl::device::get_devices(sycl::info::device_type::all);
-            // Collect other devices except for the default device.
-            if (default_device.is_cpu())
-                _cpu_device = 0;
-            for (auto &dev : sycl_all_devs)
-            {
-                if (dev == default_device)
-                {
-                    continue;
-                }
-                _devs.push_back(std::make_shared<device_ext>(dev));
-                if (_cpu_device == -1 && dev.is_cpu())
-                {
-                    _cpu_device = _devs.size() - 1;
-                }
-            }
-        }
-        void check_id(unsigned int id) const
-        {
-            if (id >= _devs.size())
-            {
-                throw std::runtime_error("invalid device id");
-            }
-        }
-        std::vector<std::shared_ptr<device_ext>> _devs;
-        /// DEFAULT_DEVICE_ID is used, if current_device_id() can not find current
-        /// thread id in _thread2dev_map, which means default device should be used
-        /// for the current thread.
-        const unsigned int DEFAULT_DEVICE_ID = 0;
-        /// thread-id to device-id map.
-        std::map<unsigned int, unsigned int> _thread2dev_map;
-        int _cpu_device = -1;
-    };
-
-    static inline sycl::queue &get_default_queue()
-    {
-        return dev_mgr::instance().current_device().default_queue();
-    }
-
-    namespace detail
-    {
-        enum class pointer_access_attribute
-        {
-            host_only = 0,
-            device_only,
-            host_device,
-            end
-        };
-
-        static pointer_access_attribute get_pointer_attribute(sycl::queue &q,
-                                                              const void *ptr)
-        {
-#ifdef DPCT_USM_LEVEL_NONE
-            return mem_mgr::instance().is_device_ptr(ptr)
-                       ? pointer_access_attribute::device_only
-                       : pointer_access_attribute::host_only;
-#else
-            switch (sycl::get_pointer_type(ptr, q.get_context()))
-            {
-            case sycl::usm::alloc::unknown:
-                return pointer_access_attribute::host_only;
-            case sycl::usm::alloc::device:
-                return pointer_access_attribute::device_only;
-            case sycl::usm::alloc::shared:
-            case sycl::usm::alloc::host:
-                return pointer_access_attribute::host_device;
-            }
-#endif
-        }
-
-        template <typename ArgT>
-        inline constexpr std::uint64_t get_type_combination_id(ArgT Val)
-        {
-            static_assert((unsigned char)library_data_t::library_data_t_size <=
-                              std::numeric_limits<unsigned char>::max() &&
-                          "library_data_t size exceeds limit.");
-            static_assert(std::is_same_v<ArgT, library_data_t>, "Unsupported ArgT");
-            return (std::uint64_t)Val;
-        }
-
-        template <typename FirstT, typename... RestT>
-        inline constexpr std::uint64_t get_type_combination_id(FirstT FirstVal,
-                                                               RestT... RestVal)
-        {
-            static_assert((std::uint8_t)library_data_t::library_data_t_size <=
-                              std::numeric_limits<unsigned char>::max() &&
-                          "library_data_t size exceeds limit.");
-            static_assert(sizeof...(RestT) <= 8 && "Too many parameters");
-            static_assert(std::is_same_v<FirstT, library_data_t>, "Unsupported FirstT");
-            return get_type_combination_id(RestVal...) << 8 | ((std::uint64_t)FirstVal);
-        }
-
-        class mem_mgr
-        {
-            mem_mgr()
-            {
-                // Reserved address space, no real memory allocation happens here.
-#if defined(__linux__)
-                mapped_address_space =
-                    (byte_t *)mmap(nullptr, mapped_region_size, PROT_NONE,
-                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
-#elif defined(_WIN64)
-                mapped_address_space = (byte_t *)VirtualAlloc(
-                    NULL,               // NULL specified as the base address parameter
-                    mapped_region_size, // Size of allocation
-                    MEM_RESERVE,        // Allocate reserved pages
-                    PAGE_NOACCESS);     // Protection = no access
-#else
-#error "Only support Windows and Linux."
-#endif
-                next_free = mapped_address_space;
-            };
-
-        public:
-            using buffer_id_t = int;
-
-            struct allocation
-            {
-                buffer_t buffer;
-                byte_t *alloc_ptr;
-                size_t size;
-            };
-
-            ~mem_mgr()
-            {
-#if defined(__linux__)
-                munmap(mapped_address_space, mapped_region_size);
-#elif defined(_WIN64)
-                VirtualFree(mapped_address_space, 0, MEM_RELEASE);
-#else
-#error "Only support Windows and Linux."
-#endif
-            };
-
-            mem_mgr(const mem_mgr &) = delete;
-            mem_mgr &operator=(const mem_mgr &) = delete;
-            mem_mgr(mem_mgr &&) = delete;
-            mem_mgr &operator=(mem_mgr &&) = delete;
-
-            /// Allocate
-            void *mem_alloc(size_t size)
-            {
-                if (!size)
-                    return nullptr;
-                std::lock_guard<std::mutex> lock(m_mutex);
-                if (next_free + size > mapped_address_space + mapped_region_size)
-                {
-                    throw std::runtime_error("dpct_malloc: out of memory for virtual memory pool");
-                }
-                // Allocation
-                sycl::range<1> r(size);
-                buffer_t buf(r);
-                allocation A{buf, next_free, size};
-                // Map allocation to device pointer
-                void *result = next_free;
-                m_map.emplace(next_free + size, A);
-                // Update pointer to the next free space.
-                next_free += (size + extra_padding + alignment - 1) & ~(alignment - 1);
-
-                return result;
-            }
-
-            /// Deallocate
-            void mem_free(const void *ptr)
-            {
-                if (!ptr)
-                    return;
-                std::lock_guard<std::mutex> lock(m_mutex);
-                auto it = get_map_iterator(ptr);
-                m_map.erase(it);
-            }
-
-            /// map: device pointer -> allocation(buffer, alloc_ptr, size)
-            allocation translate_ptr(const void *ptr)
-            {
-                std::lock_guard<std::mutex> lock(m_mutex);
-                auto it = get_map_iterator(ptr);
-                return it->second;
-            }
-
-            /// Check if the pointer represents device pointer or not.
-            bool is_device_ptr(const void *ptr) const
-            {
-                std::lock_guard<std::mutex> lock(m_mutex);
-                return (mapped_address_space <= ptr) &&
-                       (ptr < mapped_address_space + mapped_region_size);
-            }
-
-            /// Returns the instance of memory manager singleton.
-            static mem_mgr &instance()
-            {
-                static mem_mgr m;
-                return m;
-            }
-
-        private:
-            std::map<byte_t *, allocation> m_map;
-            mutable std::mutex m_mutex;
-            byte_t *mapped_address_space;
-            byte_t *next_free;
-            const size_t mapped_region_size = 128ull * 1024 * 1024 * 1024;
-            const size_t alignment = 256;
-            /// This padding may be defined to some positive value to debug
-            /// out of bound accesses.
-            const size_t extra_padding = 0;
-
-            std::map<byte_t *, allocation>::iterator get_map_iterator(const void *ptr)
-            {
-                auto it = m_map.upper_bound((byte_t *)ptr);
-                if (it == m_map.end())
-                {
-                    // Not a virtual pointer.
-                    throw std::runtime_error("can not get buffer from non-virtual pointer");
-                }
-                const allocation &alloc = it->second;
-                if (ptr < alloc.alloc_ptr)
-                {
-                    // Out of bound.
-                    // This may happen if there's a gap between allocations due to alignment
-                    // or extra padding and pointer points to this gap.
-                    throw std::runtime_error("invalid virtual pointer");
-                }
-                return it;
-            }
-        };
-
-        template <class T, memory_region Memory, size_t Dimension>
-        class accessor;
-        template <memory_region Memory, class T = byte_t>
-        class memory_traits
-        {
-        public:
-            static constexpr sycl::access::target target =
-                sycl::access::target::device;
-            static constexpr sycl::access_mode mode =
-                (Memory == constant) ? sycl::access_mode::read
-                                     : sycl::access_mode::read_write;
-            static constexpr size_t type_size = sizeof(T);
-            using element_t =
-                typename std::conditional<Memory == constant, const T, T>::type;
-            using value_t = typename std::remove_cv<T>::type;
-            template <size_t Dimension = 1>
-            using accessor_t = typename std::conditional<
-                Memory == local, sycl::local_accessor<value_t, Dimension>,
-                sycl::accessor<T, Dimension, mode, target>>::type;
-            using pointer_t = T *;
-        };
-
-        static inline void *dpct_malloc(size_t size, sycl::queue &q)
-        {
-#ifdef DPCT_USM_LEVEL_NONE
-            return mem_mgr::instance().mem_alloc(size * sizeof(byte_t));
-#else
-            return sycl::malloc_device(size, q.get_device(), q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
-        }
-
-#define PITCH_DEFAULT_ALIGN(x) (((x) + 31) & ~(0x1F))
-        static inline void *dpct_malloc(size_t &pitch, size_t x, size_t y, size_t z,
-                                        sycl::queue &q)
-        {
-            pitch = PITCH_DEFAULT_ALIGN(x);
-            return dpct_malloc(pitch * y * z, q);
-        }
-
-        /**
-         * @brief Sets \p value to the first \p size elements starting from \p dev_ptr in \p q.
-         * @tparam valueT The type of the element to be set.
-         * @param [in] q The queue in which the operation is done.
-         * @param [in] dev_ptr Pointer to the virtual device memory address.
-         * @param [in] value The value to be set.
-         * @param [in] size Number of elements to be set to the value.
-         * @return An event representing the memset operation.
-         */
-        template <typename valueT>
-        static inline sycl::event dpct_memset(sycl::queue &q, void *dev_ptr,
-                                              valueT value, size_t size)
-        {
-#ifdef DPCT_USM_LEVEL_NONE
-            auto &mm = mem_mgr::instance();
-            assert(mm.is_device_ptr(dev_ptr));
-            auto alloc = mm.translate_ptr(dev_ptr);
-            size_t offset = (valueT *)dev_ptr - (valueT *)alloc.alloc_ptr;
-
-            return q.submit([&](sycl::handler &cgh)
-                            {
-    auto r = sycl::range<1>(size);
-    auto o = sycl::id<1>(offset);
-    auto new_buffer = alloc.buffer.reinterpret<valueT>(
-        sycl::range<1>(alloc.size / sizeof(valueT)));
-    sycl::accessor<valueT, 1, sycl::access_mode::write,
-                sycl::access::target::device>
-        acc(new_buffer, cgh, r, o);
-    cgh.fill(acc, value); });
-#else
-            return q.fill(dev_ptr, value, size);
-#endif // DPCT_USM_LEVEL_NONE
-        }
-
-        /**
-         * @brief Sets \p value to the 3D memory region pointed by \p data in \p q.
-         * @tparam valueT The type of the element to be set.
-         * @param [in] q The queue in which the operation is done.
-         * @param [in] data Pointer to the pitched device memory region.
-         * @param [in] value The value to be set.
-         * @param [in] size 3D memory region by number of elements.
-         * @return An event list representing the memset operations.
-         */
-        template <typename valueT>
-        static inline std::vector<sycl::event>
-        dpct_memset(sycl::queue &q, pitched_data data, valueT value,
-                    sycl::range<3> size)
-        {
-            std::vector<sycl::event> event_list;
-            size_t slice = data.get_pitch() * data.get_y();
-            unsigned char *data_surface = (unsigned char *)data.get_data_ptr();
-            for (size_t z = 0; z < size.get(2); ++z)
-            {
-                unsigned char *data_ptr = data_surface;
-                for (size_t y = 0; y < size.get(1); ++y)
-                {
-                    event_list.push_back(dpct_memset(q, data_ptr, value, size.get(0)));
-                    data_ptr += data.get_pitch();
-                }
-                data_surface += slice;
-            }
-            return event_list;
-        }
-
-        /**
-         * @brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p q.
-         * @tparam valueT The type of the element to be set.
-         * @param [in] q The queue in which the operation is done.
-         * @param [in] ptr Pointer to the virtual device memory.
-         * @param [in] pitch The pitch size by number of elements, including padding.
-         * @param [in] val The value to be set.
-         * @param [in] x The width of memory region by number of elements.
-         * @param [in] y The height of memory region by number of elements.
-         * @return An event list representing the memset operations.
-         */
-        template <typename valueT>
-        static inline std::vector<sycl::event>
-        dpct_memset(sycl::queue &q, void *ptr, size_t pitch, valueT val, size_t x,
-                    size_t y)
-        {
-            return dpct_memset(q, pitched_data(ptr, pitch, x, 1), val,
-                               sycl::range<3>(x, y, 1));
-        }
-
-        static memcpy_direction deduce_memcpy_direction(sycl::queue &q, void *to_ptr,
-                                                        const void *from_ptr,
-                                                        memcpy_direction dir)
-        {
-            switch (dir)
-            {
-            case memcpy_direction::host_to_host:
-            case memcpy_direction::host_to_device:
-            case memcpy_direction::device_to_host:
-            case memcpy_direction::device_to_device:
-                return dir;
-            case memcpy_direction::automatic:
-            {
-                // table[to_attribute][from_attribute]
-                static const memcpy_direction
-                    direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
-                                   [static_cast<unsigned>(pointer_access_attribute::end)] =
-                                       {{memcpy_direction::host_to_host,
-                                         memcpy_direction::device_to_host,
-                                         memcpy_direction::host_to_host},
-                                        {memcpy_direction::host_to_device,
-                                         memcpy_direction::device_to_device,
-                                         memcpy_direction::device_to_device},
-                                        {memcpy_direction::host_to_host,
-                                         memcpy_direction::device_to_device,
-                                         memcpy_direction::device_to_device}};
-                return direction_table[static_cast<unsigned>(get_pointer_attribute(
-                    q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
-            }
-            default:
-                throw std::runtime_error("dpct_memcpy: invalid direction value");
-            }
-        }
-
-        static sycl::event
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
-                    memcpy_direction direction,
-                    const std::vector<sycl::event> &dep_events = {})
-        {
-            if (!size)
-                return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-            auto &mm = mem_mgr::instance();
-            auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-            switch (real_direction)
-            {
-            case host_to_host:
-                return q.submit([&](sycl::handler &cgh)
-                                {
-    cgh.depends_on(dep_events);
-    cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-            case host_to_device:
-            {
-                auto alloc = mm.translate_ptr(to_ptr);
-                size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-    cgh.depends_on(dep_events);
-    auto r = sycl::range<1>(size);
-    auto o = sycl::id<1>(offset);
-    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                        sycl::access::target::device>
-        acc(alloc.buffer, cgh, r, o);
-    cgh.copy(from_ptr, acc); });
-            }
-            case device_to_host:
-            {
-                auto alloc = mm.translate_ptr(from_ptr);
-                size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-    cgh.depends_on(dep_events);
-    auto r = sycl::range<1>(size);
-    auto o = sycl::id<1>(offset);
-    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                        sycl::access::target::device>
-        acc(alloc.buffer, cgh, r, o);
-    cgh.copy(acc, to_ptr); });
-            }
-            case device_to_device:
-            {
-                auto to_alloc = mm.translate_ptr(to_ptr);
-                auto from_alloc = mm.translate_ptr(from_ptr);
-                size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-                size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-                return q.submit([&](sycl::handler &cgh)
-                                {
-    cgh.depends_on(dep_events);
-    auto r = sycl::range<1>(size);
-    auto to_o = sycl::id<1>(to_offset);
-    auto from_o = sycl::id<1>(from_offset);
-    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                        sycl::access::target::device>
-        to_acc(to_alloc.buffer, cgh, r, to_o);
-    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                        sycl::access::target::device>
-        from_acc(from_alloc.buffer, cgh, r, from_o);
-    cgh.copy(from_acc, to_acc); });
-            }
-            default:
-                throw std::runtime_error("dpct_memcpy: invalid direction value");
-            }
-#else
-            return q.memcpy(to_ptr, from_ptr, size, dep_events);
-#endif // DPCT_USM_LEVEL_NONE
-        }
-
-        // Get actual copy range and make sure it will not exceed range.
-        static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-                                            size_t pitch)
-        {
-            return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
-        }
-
-        static inline size_t get_offset(sycl::id<3> id, size_t slice,
-                                        size_t pitch)
-        {
-            return slice * id.get(2) + pitch * id.get(1) + id.get(0);
-        }
-
-        /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
-        /// and \p from_range to another specified by \p to_ptr and \p to_range.
-        static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                    sycl::range<3> to_range, sycl::range<3> from_range,
-                    sycl::id<3> to_id, sycl::id<3> from_id,
-                    sycl::range<3> size, memcpy_direction direction,
-                    const std::vector<sycl::event> &dep_events = {})
-        {
-            // RAII for host pointer
-            class host_buffer
-            {
-                void *_buf;
-                size_t _size;
-                sycl::queue &_q;
-                const std::vector<sycl::event> &_deps; // free operation depends
-
-            public:
-                host_buffer(size_t size, sycl::queue &q,
-                            const std::vector<sycl::event> &deps)
-                    : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-                void *get_ptr() const { return _buf; }
-                size_t get_size() const { return _size; }
-                ~host_buffer()
-                {
-                    if (_buf)
-                    {
-                        _q.submit([&](sycl::handler &cgh)
-                                  {
-        cgh.depends_on(_deps);
-        cgh.host_task([buf = _buf] { std::free(buf); }); });
-                    }
-                }
-            };
-            std::vector<sycl::event> event_list;
-
-            size_t to_slice = to_range.get(1) * to_range.get(0),
-                   from_slice = from_range.get(1) * from_range.get(0);
-            unsigned char *to_surface =
-                (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-            const unsigned char *from_surface =
-                (const unsigned char *)from_ptr +
-                get_offset(from_id, from_slice, from_range.get(0));
-
-            if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
-            {
-                return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                                    direction, dep_events)};
-            }
-            direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-            size_t size_slice = size.get(1) * size.get(0);
-            switch (direction)
-            {
-            case host_to_host:
-                for (size_t z = 0; z < size.get(2); ++z)
-                {
-                    unsigned char *to_ptr = to_surface;
-                    const unsigned char *from_ptr = from_surface;
-                    if (to_range.get(0) == from_range.get(0) &&
-                        to_range.get(0) == size.get(0))
-                    {
-                        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
-                                                         direction, dep_events));
-                    }
-                    else
-                    {
-                        for (size_t y = 0; y < size.get(1); ++y)
-                        {
-                            event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
-                                                             direction, dep_events));
-                            to_ptr += to_range.get(0);
-                            from_ptr += from_range.get(0);
-                        }
-                    }
-                    to_surface += to_slice;
-                    from_surface += from_slice;
-                }
-                break;
-            case host_to_device:
-            {
-                host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                                event_list);
-                std::vector<sycl::event> host_events;
-                if (to_slice == size_slice)
-                {
-                    // Copy host data to a temp host buffer with the shape of target.
-                    host_events =
-                        dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                                    host_to_host, dep_events);
-                }
-                else
-                {
-                    // Copy host data to a temp host buffer with the shape of target.
-                    host_events = dpct_memcpy(
-                        q, buf.get_ptr(), from_surface, to_range, from_range,
-                        sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
-                        // If has padding data, not sure whether it is useless. So fill temp
-                        // buffer with it.
-                        std::vector<sycl::event>{
-                            dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
-                                        device_to_host, dep_events)});
-                }
-                // Copy from temp host buffer to device with only one submit.
-                event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
-                                                 buf.get_size(), host_to_device,
-                                                 host_events));
-                break;
-            }
-            case device_to_host:
-            {
-                host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                                event_list);
-                // Copy from host temp buffer to host target with reshaping.
-                event_list = dpct_memcpy(
-                    q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
-                    sycl::id<3>(0, 0, 0), size, host_to_host,
-                    // Copy from device to temp host buffer with only one submit.
-                    std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
-                                                         buf.get_size(),
-                                                         device_to_host, dep_events)});
-                break;
-            }
-            case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-            {
-                auto &mm = mem_mgr::instance();
-                auto to_alloc = mm.translate_ptr(to_surface);
-                auto from_alloc = mm.translate_ptr(from_surface);
-                size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-                size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-                event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                              {
-    cgh.depends_on(dep_events);
-    auto to_o = sycl::id<1>(to_offset);
-    auto from_o = sycl::id<1>(from_offset);
-    sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                        sycl::access::target::device>
-        to_acc(to_alloc.buffer, cgh,
-                get_copy_range(size, to_slice, to_range.get(0)), to_o);
-    sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                        sycl::access::target::device>
-        from_acc(from_alloc.buffer, cgh,
-                get_copy_range(size, from_slice, from_range.get(0)), from_o);
-    cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-        size,
-        [=](sycl::id<3> id) {
-            to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                from_acc[get_offset(id, from_slice, from_range.get(0))];
-        }); }));
-            }
-#else
-                event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                              {
-    cgh.depends_on(dep_events);
-    cgh.parallel_for<class dpct_memcpy_3d_detail>(
-        size,
-        [=](sycl::id<3> id) {
-            to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                from_surface[get_offset(id, from_slice, from_range.get(0))];
-        }); }));
-#endif
-            break;
-            default:
-                throw std::runtime_error("dpct_memcpy: invalid direction value");
-            }
-            return event_list;
-        }
-
-        /// memcpy 2D/3D matrix specified by pitched_data.
-        static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
-                    pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
-                    memcpy_direction direction = automatic)
-        {
-            return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-                               sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-                               sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
-                               size, direction);
-        }
-
-        /// memcpy 2D matrix with pitch.
-        static inline std::vector<sycl::event>
-        dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                    size_t to_pitch, size_t from_pitch, size_t x, size_t y,
-                    memcpy_direction direction = automatic)
-        {
-            return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-                               sycl::range<3>(from_pitch, y, 1),
-                               sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
-                               sycl::range<3>(x, y, 1), direction);
-        }
-
-        namespace deprecated
-        {
-
-            template <typename T, sycl::usm::alloc AllocKind>
-            class usm_allocator
-            {
-            private:
-                using Alloc = sycl::usm_allocator<T, AllocKind>;
-                Alloc _impl;
-
-            public:
-                using value_type = typename std::allocator_traits<Alloc>::value_type;
-                using pointer = typename std::allocator_traits<Alloc>::pointer;
-                using const_pointer = typename std::allocator_traits<Alloc>::const_pointer;
-                using void_pointer = typename std::allocator_traits<Alloc>::void_pointer;
-                using const_void_pointer =
-                    typename std::allocator_traits<Alloc>::const_void_pointer;
-                using reference = typename std::allocator_traits<Alloc>::value_type &;
-                using const_reference =
-                    const typename std::allocator_traits<Alloc>::value_type &;
-                using difference_type =
-                    typename std::allocator_traits<Alloc>::difference_type;
-                using size_type = typename std::allocator_traits<Alloc>::size_type;
-                using propagate_on_container_copy_assignment = typename std::allocator_traits<
-                    Alloc>::propagate_on_container_copy_assignment;
-                using propagate_on_container_move_assignment = typename std::allocator_traits<
-                    Alloc>::propagate_on_container_move_assignment;
-                using propagate_on_container_swap =
-                    typename std::allocator_traits<Alloc>::propagate_on_container_swap;
-                using is_always_equal =
-                    typename std::allocator_traits<Alloc>::is_always_equal;
-
-                template <typename U>
-                struct rebind
-                {
-                    typedef usm_allocator<U, AllocKind> other;
-                };
-
-                usm_allocator() : _impl(dpct::get_default_queue()) {}
-                ~usm_allocator() {}
-                usm_allocator(const usm_allocator &other) : _impl(other._impl) {}
-                usm_allocator(usm_allocator &&other) : _impl(std::move(other._impl)) {}
-                pointer address(reference r) { return &r; }
-                const_pointer address(const_reference r) { return &r; }
-                pointer allocate(size_type cnt, const_void_pointer hint = nullptr)
-                {
-                    return std::allocator_traits<Alloc>::allocate(_impl, cnt, hint);
-                }
-                void deallocate(pointer p, size_type cnt)
-                {
-                    std::allocator_traits<Alloc>::deallocate(_impl, p, cnt);
-                }
-                size_type max_size() const
-                {
-                    return std::allocator_traits<Alloc>::max_size(_impl);
-                }
-                bool operator==(const usm_allocator &other) const { return _impl == other._impl; }
-                bool operator!=(const usm_allocator &other) const { return _impl != other._impl; }
-            };
-
-        } // namespace deprecated
-
-        inline void dpct_free(void *ptr,
-                              const sycl::queue &q)
-        {
-            if (ptr)
-            {
-#ifdef DPCT_USM_LEVEL_NONE
-                detail::mem_mgr::instance().mem_free(ptr);
-#else
-                sycl::free(ptr, q.get_context());
-#endif // DPCT_USM_LEVEL_NONE
-            }
-        }
-
-        template <typename T>
-        inline auto get_memory(const void *x)
-        {
-            T *new_x = reinterpret_cast<T *>(const_cast<void *>(x));
-#ifdef DPCT_USM_LEVEL_NONE
-            return dpct::get_buffer<std::remove_cv_t<T>>(new_x);
-#else
-            return new_x;
-#endif
-        }
-
-        template <typename T>
-        inline typename DataType<T>::T2 get_value(const T *s, sycl::queue &q)
-        {
-            using Ty = typename DataType<T>::T2;
-            Ty s_h;
-            if (get_pointer_attribute(q, s) == pointer_access_attribute::device_only)
-                detail::dpct_memcpy(q, (void *)&s_h, (void *)s, sizeof(T), device_to_host)
-                    .wait();
-            else
-                s_h = *reinterpret_cast<const Ty *>(s);
-            return s_h;
-        }
-
-    } // namespace detail
-
-    template <typename T>
-    inline auto get_value(const T *s, sycl::queue &q)
-    {
-        return detail::get_value(s, q);
-    }
-
-    namespace detail
-    {
-        template <class Ta, class Tb, class Tc, class Ts>
-        inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                              oneapi::mkl::transpose b_trans, int m, int n, int k,
-                              const void *alpha, const void *a, int lda, const void *b,
-                              int ldb, const void *beta, void *c, int ldc)
-        {
-#ifndef __INTEL_MKL__
-            throw std::runtime_error("The oneAPI Math Kernel Library (oneMKL) Interfaces "
-                                     "Project does not support this API.");
-#else
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-            auto data_a = get_memory<const Ta>(a);
-            auto data_b = get_memory<const Tb>(b);
-            auto data_c = get_memory<Tc>(c);
-            oneapi::mkl::blas::column_major::gemm(
-                q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
-                data_b, ldb, beta_value, data_c, ldc);
-#endif
-        }
-
-        template <typename VecT, class BinaryOperation, class = void>
-        class vectorized_binary
-        {
-        public:
-            inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
-            {
-                VecT v4;
-                for (size_t i = 0; i < v4.size(); ++i)
-                {
-                    v4[i] = binary_op(a[i], b[i]);
-                }
-                return v4;
-            }
-        };
-
-        template <typename VecT, class BinaryOperation>
-        class vectorized_binary<
-            VecT, BinaryOperation,
-            std::void_t<std::invoke_result_t<BinaryOperation, VecT, VecT>>>
-        {
-        public:
-            inline VecT operator()(VecT a, VecT b, const BinaryOperation binary_op)
-            {
-                return binary_op(a, b).template as<VecT>();
-            }
-        };
-
-        template <class Ta, class Tb, class Tc, class Ts>
-        inline void gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                                    oneapi::mkl::transpose b_trans, int m, int n, int k,
-                                    const void *alpha, const void **a, int lda,
-                                    const void **b, int ldb, const void *beta, void **c,
-                                    int ldc, int batch_size)
-        {
-            struct matrix_info_t
-            {
-                oneapi::mkl::transpose transpose_info[2];
-                Ts value_info[2];
-                std::int64_t size_info[3];
-                std::int64_t ld_info[3];
-                std::int64_t groupsize_info;
-            };
-
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-
-            matrix_info_t *matrix_info =
-                (matrix_info_t *)std::malloc(sizeof(matrix_info_t));
-            matrix_info->transpose_info[0] = a_trans;
-            matrix_info->transpose_info[1] = b_trans;
-            matrix_info->value_info[0] = alpha_value;
-            matrix_info->value_info[1] = beta_value;
-            matrix_info->size_info[0] = m;
-            matrix_info->size_info[1] = n;
-            matrix_info->size_info[2] = k;
-            matrix_info->ld_info[0] = lda;
-            matrix_info->ld_info[1] = ldb;
-            matrix_info->ld_info[2] = ldc;
-            matrix_info->groupsize_info = batch_size;
-
-            sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
-                q, matrix_info->transpose_info, matrix_info->transpose_info + 1,
-                matrix_info->size_info, matrix_info->size_info + 1,
-                matrix_info->size_info + 2, matrix_info->value_info,
-                reinterpret_cast<const Ta **>(a), matrix_info->ld_info,
-                reinterpret_cast<const Tb **>(b), matrix_info->ld_info + 1,
-                matrix_info->value_info + 1, reinterpret_cast<Tc **>(c),
-                matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
-
-            q.submit([&](sycl::handler &cgh)
-                     {
-    cgh.depends_on(e);
-    cgh.host_task([=] { std::free(matrix_info); }); });
-        }
-
-        template <class Ta, class Tb, class Tc, class Ts>
-        inline void
-        gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                        oneapi::mkl::transpose b_trans, int m, int n,
-                        int k, const void *alpha, const void *a, int lda,
-                        long long int stride_a, const void *b, int ldb,
-                        long long int stride_b, const void *beta, void *c,
-                        int ldc, long long int stride_c, int batch_size)
-        {
-            Ts alpha_value = dpct::get_value(reinterpret_cast<const Ts *>(alpha), q);
-            Ts beta_value = dpct::get_value(reinterpret_cast<const Ts *>(beta), q);
-            auto data_a = get_memory<const Ta>(a);
-            auto data_b = get_memory<const Tb>(b);
-            auto data_c = get_memory<Tc>(c);
-            oneapi::mkl::blas::column_major::gemm_batch(
-                q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda,
-                stride_a, data_b, ldb, stride_b, beta_value,
-                data_c, ldc, stride_c, batch_size);
-        }
-
-    } // namespace detail
-
-    template <typename VecT, class BinaryOperation>
-    inline unsigned vectorized_binary(unsigned a, unsigned b,
-                                      const BinaryOperation binary_op)
-    {
-        sycl::vec<unsigned, 1> v0{a}, v1{b};
-        auto v2 = v0.as<VecT>();
-        auto v3 = v1.as<VecT>();
-        auto v4 =
-            detail::vectorized_binary<VecT, BinaryOperation>()(v2, v3, binary_op);
-        v0 = v4.template as<sycl::vec<unsigned, 1>>();
-        return v0;
-    }
-
-    static void async_dpct_memcpy(void *to_ptr, const void *from_ptr, size_t size,
-                                  memcpy_direction direction = automatic,
-                                  sycl::queue &q = dpct::get_default_queue())
-    {
-        detail::dpct_memcpy(q, to_ptr, from_ptr, size, direction);
-    }
-
-    static inline unsigned int select_device(unsigned int id)
-    {
-        dev_mgr::instance().select_device(id);
-        return id;
-    }
-
-    template <typename T>
-    T permute_sub_group_by_xor(sycl::sub_group g, T x, unsigned int mask,
-                               int logical_sub_group_size = 32)
-    {
-        unsigned int id = g.get_local_linear_id();
-        unsigned int start_index =
-            id / logical_sub_group_size * logical_sub_group_size;
-        unsigned int target_offset = (id % logical_sub_group_size) ^ mask;
-        return sycl::select_from_group(g, x,
-                                       target_offset < logical_sub_group_size
-                                           ? start_index + target_offset
-                                           : id);
-    }
-
-    template <typename T>
-    sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val)
-    {
-        return sycl::vec<T, 1>(val)
-            .template as<sycl::vec<
-                std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
-            .template convert<T>();
-    }
-
-    template <typename T1, typename T2>
-    using dot_product_acc_t =
-        std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
-                           uint32_t, int32_t>;
-
-    template <typename T1, typename T2, typename T3>
-    inline auto dp4a(T1 a, T2 b, T3 c)
-    {
-        dot_product_acc_t<T1, T2> res = c;
-        auto va = extract_and_sign_or_zero_extend4(a);
-        auto vb = extract_and_sign_or_zero_extend4(b);
-        res += va[0] * vb[0];
-        res += va[1] * vb[1];
-        res += va[2] * vb[2];
-        res += va[3] * vb[3];
-        return res;
-    }
-
-    struct sub_sat
-    {
-        template <typename T>
-        auto operator()(const T x, const T y) const
-        {
-            return sycl::sub_sat(x, y);
-        }
-    };
-
-    template <typename S, typename T>
-    inline T vectorized_min(T a, T b)
-    {
-        sycl::vec<T, 1> v0{a}, v1{b};
-        auto v2 = v0.template as<S>();
-        auto v3 = v1.template as<S>();
-        auto v4 = sycl::min(v2, v3);
-        v0 = v4.template as<sycl::vec<T, 1>>();
-        return v0;
-    }
-
-    inline float pow(const float a, const int b) { return sycl::pown(a, b); }
-    inline double pow(const double a, const int b) { return sycl::pown(a, b); }
-    inline float pow(const float a, const float b) { return sycl::pow(a, b); }
-    inline double pow(const double a, const double b) { return sycl::pow(a, b); }
-    template <typename T, typename U>
-    inline typename std::enable_if_t<std::is_floating_point_v<T>, T>
-    pow(const T a, const U b)
-    {
-        return sycl::pow(a, static_cast<T>(b));
-    }
-    template <typename T, typename U>
-    inline typename std::enable_if_t<!std::is_floating_point_v<T>, double>
-    pow(const T a, const U b)
-    {
-        return sycl::pow(static_cast<double>(a), static_cast<double>(b));
-    }
-
-    inline double min(const double a, const float b)
-    {
-        return sycl::fmin(a, static_cast<double>(b));
-    }
-    inline double min(const float a, const double b)
-    {
-        return sycl::fmin(static_cast<double>(a), b);
-    }
-    inline float min(const float a, const float b) { return sycl::fmin(a, b); }
-    inline double min(const double a, const double b) { return sycl::fmin(a, b); }
-    inline std::uint32_t min(const std::uint32_t a, const std::int32_t b)
-    {
-        return sycl::min(a, static_cast<std::uint32_t>(b));
-    }
-    inline std::uint32_t min(const std::int32_t a, const std::uint32_t b)
-    {
-        return sycl::min(static_cast<std::uint32_t>(a), b);
-    }
-    inline std::int32_t min(const std::int32_t a, const std::int32_t b)
-    {
-        return sycl::min(a, b);
-    }
-    inline std::uint32_t min(const std::uint32_t a, const std::uint32_t b)
-    {
-        return sycl::min(a, b);
-    }
-    inline std::uint64_t min(const std::uint64_t a, const std::int64_t b)
-    {
-        return sycl::min(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t min(const std::int64_t a, const std::uint64_t b)
-    {
-        return sycl::min(static_cast<std::uint64_t>(a), b);
-    }
-    inline std::int64_t min(const std::int64_t a, const std::int64_t b)
-    {
-        return sycl::min(a, b);
-    }
-    inline std::uint64_t min(const std::uint64_t a, const std::uint64_t b)
-    {
-        return sycl::min(a, b);
-    }
-    inline std::uint64_t min(const std::uint64_t a, const std::int32_t b)
-    {
-        return sycl::min(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t min(const std::int32_t a, const std::uint64_t b)
-    {
-        return sycl::min(static_cast<std::uint64_t>(a), b);
-    }
-    inline std::uint64_t min(const std::uint64_t a, const std::uint32_t b)
-    {
-        return sycl::min(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t min(const std::uint32_t a, const std::uint64_t b)
-    {
-        return sycl::min(static_cast<std::uint64_t>(a), b);
-    }
-    // max function overloads.
-    // For floating-point types, `float` or `double` arguments are acceptable.
-    // For integer types, `std::uint32_t`, `std::int32_t`, `std::uint64_t` or
-    // `std::int64_t` type arguments are acceptable.
-    inline double max(const double a, const float b)
-    {
-        return sycl::fmax(a, static_cast<double>(b));
-    }
-    inline double max(const float a, const double b)
-    {
-        return sycl::fmax(static_cast<double>(a), b);
-    }
-    inline float max(const float a, const float b) { return sycl::fmax(a, b); }
-    inline double max(const double a, const double b) { return sycl::fmax(a, b); }
-    inline std::uint32_t max(const std::uint32_t a, const std::int32_t b)
-    {
-        return sycl::max(a, static_cast<std::uint32_t>(b));
-    }
-    inline std::uint32_t max(const std::int32_t a, const std::uint32_t b)
-    {
-        return sycl::max(static_cast<std::uint32_t>(a), b);
-    }
-    inline std::int32_t max(const std::int32_t a, const std::int32_t b)
-    {
-        return sycl::max(a, b);
-    }
-    inline std::uint32_t max(const std::uint32_t a, const std::uint32_t b)
-    {
-        return sycl::max(a, b);
-    }
-    inline std::uint64_t max(const std::uint64_t a, const std::int64_t b)
-    {
-        return sycl::max(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t max(const std::int64_t a, const std::uint64_t b)
-    {
-        return sycl::max(static_cast<std::uint64_t>(a), b);
-    }
-    inline std::int64_t max(const std::int64_t a, const std::int64_t b)
-    {
-        return sycl::max(a, b);
-    }
-    inline std::uint64_t max(const std::uint64_t a, const std::uint64_t b)
-    {
-        return sycl::max(a, b);
-    }
-    inline std::uint64_t max(const std::uint64_t a, const std::int32_t b)
-    {
-        return sycl::max(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t max(const std::int32_t a, const std::uint64_t b)
-    {
-        return sycl::max(static_cast<std::uint64_t>(a), b);
-    }
-    inline std::uint64_t max(const std::uint64_t a, const std::uint32_t b)
-    {
-        return sycl::max(a, static_cast<std::uint64_t>(b));
-    }
-    inline std::uint64_t max(const std::uint32_t a, const std::uint64_t b)
-    {
-        return sycl::max(static_cast<std::uint64_t>(a), b);
-    }
-
-    inline void
-    has_capability_or_fail(const sycl::device &dev,
-                           const std::initializer_list<sycl::aspect> &props)
-    {
-        for (const auto &it : props)
-        {
-            if (dev.has(it))
-                continue;
-            switch (it)
-            {
-            case sycl::aspect::fp64:
-                throw std::runtime_error("'double' is not supported in '" +
-                                         dev.get_info<sycl::info::device::name>() +
-                                         "' device");
-                break;
-            case sycl::aspect::fp16:
-                throw std::runtime_error("'half' is not supported in '" +
-                                         dev.get_info<sycl::info::device::name>() +
-                                         "' device");
-                break;
-            default:
-#define __SYCL_ASPECT(ASPECT, ID) \
-    case sycl::aspect::ASPECT:    \
-        return #ASPECT;
-#define __SYCL_ASPECT_DEPRECATED(ASPECT, ID, MESSAGE) __SYCL_ASPECT(ASPECT, ID)
-#define __SYCL_ASPECT_DEPRECATED_ALIAS(ASPECT, ID, MESSAGE)
-                auto getAspectNameStr = [](sycl::aspect AspectNum) -> std::string
-                {
-                    switch (AspectNum)
-                    {
-#include <sycl/info/aspects.def>
-#include <sycl/info/aspects_deprecated.def>
-                    default:
-                        return "unknown aspect";
-                    }
-                };
-#undef __SYCL_ASPECT_DEPRECATED_ALIAS
-#undef __SYCL_ASPECT_DEPRECATED
-#undef __SYCL_ASPECT
-                throw std::runtime_error(
-                    "'" + getAspectNameStr(it) + "' is not supported in '" +
-                    dev.get_info<sycl::info::device::name>() + "' device");
-            }
-            break;
-        }
-    }
-
-    static inline unsigned int get_current_device_id()
-    {
-        return dev_mgr::instance().current_device_id();
-    }
-
-    static inline device_ext &get_current_device()
-    {
-        return dev_mgr::instance().current_device();
-    }
-
-    static inline sycl::queue &get_in_order_queue()
-    {
-        return dev_mgr::instance().current_device().in_order_queue();
-    }
-
-    static sycl::event
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr, size_t size,
-                memcpy_direction direction,
-                const std::vector<sycl::event> &dep_events = {})
-    {
-        if (!size)
-            return sycl::event{};
-#ifdef DPCT_USM_LEVEL_NONE
-        auto &mm = mem_mgr::instance();
-        auto real_direction = deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-
-        switch (real_direction)
-        {
-        case host_to_host:
-            return q.submit([&](sycl::handler &cgh)
-                            {
-        cgh.depends_on(dep_events);
-        cgh.host_task([=] { std::memcpy(to_ptr, from_ptr, size); }); });
-        case host_to_device:
-        {
-            auto alloc = mm.translate_ptr(to_ptr);
-            size_t offset = (byte_t *)to_ptr - alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-        cgh.depends_on(dep_events);
-        auto r = sycl::range<1>(size);
-        auto o = sycl::id<1>(offset);
-        sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                            sycl::access::target::device>
-            acc(alloc.buffer, cgh, r, o);
-        cgh.copy(from_ptr, acc); });
-        }
-        case device_to_host:
-        {
-            auto alloc = mm.translate_ptr(from_ptr);
-            size_t offset = (byte_t *)from_ptr - alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-        cgh.depends_on(dep_events);
-        auto r = sycl::range<1>(size);
-        auto o = sycl::id<1>(offset);
-        sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                            sycl::access::target::device>
-            acc(alloc.buffer, cgh, r, o);
-        cgh.copy(acc, to_ptr); });
-        }
-        case device_to_device:
-        {
-            auto to_alloc = mm.translate_ptr(to_ptr);
-            auto from_alloc = mm.translate_ptr(from_ptr);
-            size_t to_offset = (byte_t *)to_ptr - to_alloc.alloc_ptr;
-            size_t from_offset = (byte_t *)from_ptr - from_alloc.alloc_ptr;
-            return q.submit([&](sycl::handler &cgh)
-                            {
-        cgh.depends_on(dep_events);
-        auto r = sycl::range<1>(size);
-        auto to_o = sycl::id<1>(to_offset);
-        auto from_o = sycl::id<1>(from_offset);
-        sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                            sycl::access::target::device>
-            to_acc(to_alloc.buffer, cgh, r, to_o);
-        sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                            sycl::access::target::device>
-            from_acc(from_alloc.buffer, cgh, r, from_o);
-        cgh.copy(from_acc, to_acc); });
-        }
-        default:
-            throw std::runtime_error("dpct_memcpy: invalid direction value");
-        }
-#else
-        return q.memcpy(to_ptr, from_ptr, size, dep_events);
-#endif // DPCT_USM_LEVEL_NONE
-    }
-
-    // Get actual copy range and make sure it will not exceed range.
-    static inline size_t get_copy_range(sycl::range<3> size, size_t slice,
-                                        size_t pitch)
-    {
-        return slice * (size.get(2) - 1) + pitch * (size.get(1) - 1) + size.get(0);
-    }
-
-    static inline size_t get_offset(sycl::id<3> id, size_t slice,
-                                    size_t pitch)
-    {
-        return slice * id.get(2) + pitch * id.get(1) + id.get(0);
-    }
-
-    /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
-    /// and \p from_range to another specified by \p to_ptr and \p to_range.
-    static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                sycl::range<3> to_range, sycl::range<3> from_range,
-                sycl::id<3> to_id, sycl::id<3> from_id,
-                sycl::range<3> size, memcpy_direction direction,
-                const std::vector<sycl::event> &dep_events = {})
-    {
-        // RAII for host pointer
-        class host_buffer
-        {
-            void *_buf;
-            size_t _size;
-            sycl::queue &_q;
-            const std::vector<sycl::event> &_deps; // free operation depends
-
-        public:
-            host_buffer(size_t size, sycl::queue &q,
-                        const std::vector<sycl::event> &deps)
-                : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-            void *get_ptr() const { return _buf; }
-            size_t get_size() const { return _size; }
-            ~host_buffer()
-            {
-                if (_buf)
-                {
-                    _q.submit([&](sycl::handler &cgh)
-                              {
-            cgh.depends_on(_deps);
-            cgh.host_task([buf = _buf] { std::free(buf); }); });
-                }
-            }
-        };
-        std::vector<sycl::event> event_list;
-
-        size_t to_slice = to_range.get(1) * to_range.get(0),
-               from_slice = from_range.get(1) * from_range.get(0);
-        unsigned char *to_surface =
-            (unsigned char *)to_ptr + get_offset(to_id, to_slice, to_range.get(0));
-        const unsigned char *from_surface =
-            (const unsigned char *)from_ptr +
-            get_offset(from_id, from_slice, from_range.get(0));
-
-        if (to_slice == from_slice && to_slice == size.get(1) * size.get(0))
-        {
-            return {dpct_memcpy(q, to_surface, from_surface, to_slice * size.get(2),
-                                direction, dep_events)};
-        }
-        direction = detail::deduce_memcpy_direction(q, to_ptr, from_ptr, direction);
-        size_t size_slice = size.get(1) * size.get(0);
-        switch (direction)
-        {
-        case host_to_host:
-            for (size_t z = 0; z < size.get(2); ++z)
-            {
-                unsigned char *to_ptr = to_surface;
-                const unsigned char *from_ptr = from_surface;
-                if (to_range.get(0) == from_range.get(0) &&
-                    to_range.get(0) == size.get(0))
-                {
-                    event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size_slice,
-                                                     direction, dep_events));
-                }
-                else
-                {
-                    for (size_t y = 0; y < size.get(1); ++y)
-                    {
-                        event_list.push_back(dpct_memcpy(q, to_ptr, from_ptr, size.get(0),
-                                                         direction, dep_events));
-                        to_ptr += to_range.get(0);
-                        from_ptr += from_range.get(0);
-                    }
-                }
-                to_surface += to_slice;
-                from_surface += from_slice;
-            }
-            break;
-        case host_to_device:
-        {
-            host_buffer buf(get_copy_range(size, to_slice, to_range.get(0)), q,
-                            event_list);
-            std::vector<sycl::event> host_events;
-            if (to_slice == size_slice)
-            {
-                // Copy host data to a temp host buffer with the shape of target.
-                host_events =
-                    dpct_memcpy(q, buf.get_ptr(), from_surface, to_range, from_range,
-                                sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size,
-                                host_to_host, dep_events);
-            }
-            else
-            {
-                // Copy host data to a temp host buffer with the shape of target.
-                host_events = dpct_memcpy(
-                    q, buf.get_ptr(), from_surface, to_range, from_range,
-                    sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0), size, host_to_host,
-                    // If has padding data, not sure whether it is useless. So fill temp
-                    // buffer with it.
-                    std::vector<sycl::event>{
-                        dpct_memcpy(q, buf.get_ptr(), to_surface, buf.get_size(),
-                                    device_to_host, dep_events)});
-            }
-            // Copy from temp host buffer to device with only one submit.
-            event_list.push_back(dpct_memcpy(q, to_surface, buf.get_ptr(),
-                                             buf.get_size(), host_to_device,
-                                             host_events));
-            break;
-        }
-        case device_to_host:
-        {
-            host_buffer buf(get_copy_range(size, from_slice, from_range.get(0)), q,
-                            event_list);
-            // Copy from host temp buffer to host target with reshaping.
-            event_list = dpct_memcpy(
-                q, to_surface, buf.get_ptr(), to_range, from_range, sycl::id<3>(0, 0, 0),
-                sycl::id<3>(0, 0, 0), size, host_to_host,
-                // Copy from device to temp host buffer with only one submit.
-                std::vector<sycl::event>{dpct_memcpy(q, buf.get_ptr(), from_surface,
-                                                     buf.get_size(),
-                                                     device_to_host, dep_events)});
-            break;
-        }
-        case device_to_device:
-#ifdef DPCT_USM_LEVEL_NONE
-        {
-            auto &mm = mem_mgr::instance();
-            auto to_alloc = mm.translate_ptr(to_surface);
-            auto from_alloc = mm.translate_ptr(from_surface);
-            size_t to_offset = (byte_t *)to_surface - to_alloc.alloc_ptr;
-            size_t from_offset = (byte_t *)from_surface - from_alloc.alloc_ptr;
-            event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                          {
-        cgh.depends_on(dep_events);
-        auto to_o = sycl::id<1>(to_offset);
-        auto from_o = sycl::id<1>(from_offset);
-        sycl::accessor<byte_t, 1, sycl::access_mode::write,
-                            sycl::access::target::device>
-            to_acc(to_alloc.buffer, cgh,
-                    get_copy_range(size, to_slice, to_range.get(0)), to_o);
-        sycl::accessor<byte_t, 1, sycl::access_mode::read,
-                            sycl::access::target::device>
-            from_acc(from_alloc.buffer, cgh,
-                    get_copy_range(size, from_slice, from_range.get(0)), from_o);
-        cgh.parallel_for<class dpct_memcpy_3d_detail_usmnone>(
-            size,
-            [=](sycl::id<3> id) {
-                to_acc[get_offset(id, to_slice, to_range.get(0))] =
-                    from_acc[get_offset(id, from_slice, from_range.get(0))];
-            }); }));
-        }
-#else
-            event_list.push_back(q.submit([&](sycl::handler &cgh)
-                                          {
-        cgh.depends_on(dep_events);
-        cgh.parallel_for<class dpct_memcpy_3d_detail>(
-            size,
-            [=](sycl::id<3> id) {
-                to_surface[get_offset(id, to_slice, to_range.get(0))] =
-                    from_surface[get_offset(id, from_slice, from_range.get(0))];
-            }); }));
-#endif
-        break;
-        default:
-            throw std::runtime_error("dpct_memcpy: invalid direction value");
-        }
-        return event_list;
-    }
-
-    /// memcpy 2D/3D matrix specified by pitched_data.
-    static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, pitched_data to, sycl::id<3> to_id,
-                pitched_data from, sycl::id<3> from_id, sycl::range<3> size,
-                memcpy_direction direction = automatic)
-    {
-        return dpct_memcpy(q, to.get_data_ptr(), from.get_data_ptr(),
-                           sycl::range<3>(to.get_pitch(), to.get_y(), 1),
-                           sycl::range<3>(from.get_pitch(), from.get_y(), 1), to_id, from_id,
-                           size, direction);
-    }
-
-    /// memcpy 2D matrix with pitch.
-    static inline std::vector<sycl::event>
-    dpct_memcpy(sycl::queue &q, void *to_ptr, const void *from_ptr,
-                size_t to_pitch, size_t from_pitch, size_t x, size_t y,
-                memcpy_direction direction = automatic)
-    {
-        return dpct_memcpy(q, to_ptr, from_ptr, sycl::range<3>(to_pitch, y, 1),
-                           sycl::range<3>(from_pitch, y, 1),
-                           sycl::id<3>(0, 0, 0), sycl::id<3>(0, 0, 0),
-                           sycl::range<3>(x, y, 1), direction);
-    }
-
-    inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                     oneapi::mkl::transpose b_trans, int m, int n, int k,
-                     const void *alpha, const void *a, library_data_t a_type,
-                     int lda, const void *b, library_data_t b_type, int ldb,
-                     const void *beta, void *c, library_data_t c_type, int ldc,
-                     library_data_t scaling_type)
-    {
-        if (scaling_type == library_data_t::real_float &&
-            c_type == library_data_t::complex_float)
-        {
-            scaling_type = library_data_t::complex_float;
-        }
-        else if (scaling_type == library_data_t::real_double &&
-                 c_type == library_data_t::complex_double)
-        {
-            scaling_type = library_data_t::complex_double;
-        }
-
-        std::uint64_t key =
-            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-        switch (key)
-        {
-        case detail::get_type_combination_id(
-            library_data_t::real_float, library_data_t::real_float,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<float, float, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_double, library_data_t::real_double,
-            library_data_t::real_double, library_data_t::real_double):
-        {
-            detail::gemm_impl<double, double, double, double>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_float, library_data_t::complex_float,
-            library_data_t::complex_float, library_data_t::complex_float):
-        {
-            detail::gemm_impl<std::complex<float>, std::complex<float>,
-                              std::complex<float>, std::complex<float>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_double, library_data_t::complex_double,
-            library_data_t::complex_double, library_data_t::complex_double):
-        {
-            detail::gemm_impl<std::complex<double>, std::complex<double>,
-                              std::complex<double>, std::complex<double>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_half):
-        {
-            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                              sycl::half>(q, a_trans, b_trans, m, n, k, alpha, a,
-                                          lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                              float>(q, a_trans, b_trans, m, n, k, alpha, a, lda, b,
-                                     ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<sycl::half, sycl::half, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_float):
-        {
-            float alpha_value =
-                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-            float beta_value =
-                dpct::get_value(reinterpret_cast<const float *>(beta), q);
-            sycl::half alpha_half(alpha_value);
-            sycl::half beta_half(beta_value);
-            detail::gemm_impl<sycl::half, sycl::half, sycl::half,
-                              sycl::half>(q, a_trans, b_trans, m, n, k, &alpha_half,
-                                          a, lda, b, ldb, &beta_half, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_impl<std::int8_t, std::int8_t, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_bfloat16, library_data_t::real_float):
-        {
-            detail::gemm_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                              oneapi::mkl::bfloat16, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_int32, library_data_t::real_int32):
-        {
-            float alpha_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-            float beta_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-            detail::gemm_impl<std::int8_t, std::int8_t, std::int32_t, float>(
-                q, a_trans, b_trans, m, n, k, &alpha_float, a, lda, b, ldb, &beta_float, c, ldc);
-            break;
-        }
-        default:
-            throw std::runtime_error("the combination of data type is unsupported");
-        }
-    } // gemm()
-
-    /// Computes a batch of matrix-matrix product with general matrices.
-    /// \param [in] q The queue where the routine should be executed.
-    /// \param [in] a_trans Specifies the operation applied to A.
-    /// \param [in] b_trans Specifies the operation applied to B.
-    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
-    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
-    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
-    /// \param [in] alpha Scaling factor for the matrix-matrix product.
-    /// \param [in] a Input matrix A.
-    /// \param [in] a_type Data type of the matrix A.
-    /// \param [in] lda Leading dimension of A.
-    /// \param [in] b Input matrix B.
-    /// \param [in] b_type Data type of the matrix B.
-    /// \param [in] ldb Leading dimension of B.
-    /// \param [in] beta Scaling factor for matrix C.
-    /// \param [in, out] c Input/Output matrix C.
-    /// \param [in] c_type Data type of the matrix C.
-    /// \param [in] ldc Leading dimension of C.
-    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
-    /// \param [in] scaling_type Data type of the scaling factors.
-    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                           oneapi::mkl::transpose b_trans, int m, int n, int k,
-                           const void *alpha, const void *a[],
-                           library_data_t a_type, int lda, const void *b[],
-                           library_data_t b_type, int ldb, const void *beta,
-                           void *c[], library_data_t c_type, int ldc,
-                           int batch_size, library_data_t scaling_type)
-    {
-#ifdef DPCT_USM_LEVEL_NONE
-        throw std::runtime_error("this API is unsupported when USM level is none");
-#else
-        if (scaling_type == library_data_t::real_float &&
-            c_type == library_data_t::complex_float)
-        {
-            scaling_type = library_data_t::complex_float;
-        }
-        else if (scaling_type == library_data_t::real_double &&
-                 c_type == library_data_t::complex_double)
-        {
-            scaling_type = library_data_t::complex_double;
-        }
-
-        std::uint64_t key =
-            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-        switch (key)
-        {
-        case detail::get_type_combination_id(
-            library_data_t::real_float, library_data_t::real_float,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<float, float, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_double, library_data_t::real_double,
-            library_data_t::real_double, library_data_t::real_double):
-        {
-            detail::gemm_batch_impl<double, double, double, double>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_float, library_data_t::complex_float,
-            library_data_t::complex_float, library_data_t::complex_float):
-        {
-            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                                    std::complex<float>, std::complex<float>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_double, library_data_t::complex_double,
-            library_data_t::complex_double, library_data_t::complex_double):
-        {
-            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                                    std::complex<double>, std::complex<double>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_half):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                                a, lda, b, ldb, beta, c, ldc,
-                                                batch_size);
-            break;
-        }
-#ifdef __INTEL_MKL__
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_bfloat16, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                                    oneapi::mkl::bfloat16, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                           b, ldb, beta, c, ldc, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_int32, library_data_t::real_int32):
-        {
-            float alpha_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(alpha), q);
-            float beta_float =
-                dpct::get_value(reinterpret_cast<const std::int32_t *>(beta), q);
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                                    float>(q, a_trans, b_trans, m, n, k, &alpha_float,
-                                           a, lda, b, ldb, &beta_float, c, ldc,
-                                           batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc,
-                batch_size);
-            break;
-        }
-#endif
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_float):
-        {
-            float alpha_value =
-                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-            float beta_value =
-                dpct::get_value(reinterpret_cast<const float *>(beta), q);
-            sycl::half alpha_half(alpha_value);
-            sycl::half beta_half(beta_value);
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, b, ldb, &beta_half, c, ldc,
-                batch_size);
-            break;
-        }
-        default:
-            throw std::runtime_error("the combination of data type is unsupported");
-        }
-#endif
-    }
-
-    /// Computes a batch of matrix-matrix product with general matrices.
-    /// \param [in] q The queue where the routine should be executed.
-    /// \param [in] a_trans Specifies the operation applied to A.
-    /// \param [in] b_trans Specifies the operation applied to B.
-    /// \param [in] m Specifies the number of rows of the matrix op(A) and of the matrix C.
-    /// \param [in] n Specifies the number of columns of the matrix op(B) and of the matrix C.
-    /// \param [in] k Specifies the number of columns of the matrix op(A) and the number of rows of the matrix op(B).
-    /// \param [in] alpha Scaling factor for the matrix-matrix product.
-    /// \param [in] a Input matrix A.
-    /// \param [in] a_type Data type of the matrix A.
-    /// \param [in] lda Leading dimension of A.
-    /// \param [in] stride_a Stride between the different A matrices.
-    /// \param [in] b Input matrix B.
-    /// \param [in] b_type Data type of the matrix B.
-    /// \param [in] ldb Leading dimension of B.
-    /// \param [in] stride_b Stride between the different B matrices.
-    /// \param [in] beta Scaling factor for matrix C.
-    /// \param [in, out] c Input/Output matrix C.
-    /// \param [in] c_type Data type of the matrix C.
-    /// \param [in] ldc Leading dimension of C.
-    /// \param [in] stride_c Stride between the different C matrices.
-    /// \param [in] batch_size Specifies the number of matrix multiply operations to perform.
-    /// \param [in] scaling_type Data type of the scaling factors.
-    inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                           oneapi::mkl::transpose b_trans, int m, int n, int k,
-                           const void *alpha, const void *a, library_data_t a_type,
-                           int lda, long long int stride_a, const void *b,
-                           library_data_t b_type, int ldb, long long int stride_b,
-                           const void *beta, void *c, library_data_t c_type,
-                           int ldc, long long int stride_c, int batch_size,
-                           library_data_t scaling_type)
-    {
-        if (scaling_type == library_data_t::real_float &&
-            c_type == library_data_t::complex_float)
-        {
-            scaling_type = library_data_t::complex_float;
-        }
-        else if (scaling_type == library_data_t::real_double &&
-                 c_type == library_data_t::complex_double)
-        {
-            scaling_type = library_data_t::complex_double;
-        }
-
-        std::uint64_t key =
-            detail::get_type_combination_id(a_type, b_type, c_type, scaling_type);
-        switch (key)
-        {
-        case detail::get_type_combination_id(
-            library_data_t::real_float, library_data_t::real_float,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<float, float, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_double, library_data_t::real_double,
-            library_data_t::real_double, library_data_t::real_double):
-        {
-            detail::gemm_batch_impl<double, double, double, double>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_float, library_data_t::complex_float,
-            library_data_t::complex_float, library_data_t::complex_float):
-        {
-            detail::gemm_batch_impl<std::complex<float>, std::complex<float>,
-                                    std::complex<float>, std::complex<float>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::complex_double, library_data_t::complex_double,
-            library_data_t::complex_double, library_data_t::complex_double):
-        {
-            detail::gemm_batch_impl<std::complex<double>, std::complex<double>,
-                                    std::complex<double>, std::complex<double>>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_half):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half,
-                                    sycl::half>(q, a_trans, b_trans, m, n, k, alpha,
-                                                a, lda, stride_a, b, ldb, stride_b,
-                                                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-#ifdef __INTEL_MKL__
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_bfloat16, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16,
-                                    oneapi::mkl::bfloat16, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_bfloat16, library_data_t::real_bfloat16,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<oneapi::mkl::bfloat16, oneapi::mkl::bfloat16, float,
-                                    float>(q, a_trans, b_trans, m, n, k, alpha, a, lda,
-                                           stride_a, b, ldb, stride_b, beta, c, ldc,
-                                           stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_int32, library_data_t::real_int32):
-        {
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, std::int32_t,
-                                    std::int32_t>(q, a_trans, b_trans, m, n, k, alpha,
-                                                  a, lda, stride_a, b, ldb, stride_b,
-                                                  beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_int8, library_data_t::real_int8,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<std::int8_t, std::int8_t, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_float, library_data_t::real_float):
-        {
-            detail::gemm_batch_impl<sycl::half, sycl::half, float, float>(
-                q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b,
-                beta, c, ldc, stride_c, batch_size);
-            break;
-        }
-#endif
-        case detail::get_type_combination_id(
-            library_data_t::real_half, library_data_t::real_half,
-            library_data_t::real_half, library_data_t::real_float):
-        {
-            float alpha_value =
-                dpct::get_value(reinterpret_cast<const float *>(alpha), q);
-            float beta_value =
-                dpct::get_value(reinterpret_cast<const float *>(beta), q);
-            sycl::half alpha_half(alpha_value);
-            sycl::half beta_half(beta_value);
-            detail::gemm_batch_impl<sycl::half, sycl::half, sycl::half, sycl::half>(
-                q, a_trans, b_trans, m, n, k, &alpha_half, a, lda, stride_a, b, ldb, stride_b,
-                &beta_half, c, ldc, stride_c, batch_size);
-            break;
-        }
-        default:
-            throw std::runtime_error("the combination of data type is unsupported");
-        }
-    }
-
-    static inline void
-    async_dpct_memcpy(void *to_ptr, size_t to_pitch, const void *from_ptr,
-                      size_t from_pitch, size_t x, size_t y,
-                      memcpy_direction direction = automatic,
-                      sycl::queue &q = get_default_queue())
-    {
-        detail::dpct_memcpy(q, to_ptr, from_ptr, to_pitch, from_pitch, x, y,
-                            direction);
-    }
-
-    using err0 = detail::generic_error_type<struct err0_tag, int>;
-    using err1 = detail::generic_error_type<struct err1_tag, int>;
-
-} // COPY from DPCT head files
-
-#endif
+#endif
\ No newline at end of file

From 7a44a95b08fa8fb23cbafcf2c4f4573b1e6874c6 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 24 Jan 2024 14:39:46 +0800
Subject: [PATCH 59/90] update CI/action for sycl code, fix CI error of
 repeat/dup

---
 .github/workflows/build.yml | 45 +++++++++++++++++++++
 ci/README.md                |  4 ++
 ci/run.sh                   | 11 +++++
 ggml-sycl.cpp               | 80 ++++++++++++++++++++++++++++++++++---
 4 files changed, 134 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index c3aa6f9924db8..820878a8f4d22 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -143,6 +143,51 @@ jobs:
           cd build
           ctest --verbose
 
+  ubuntu-22.04-cmake-sycl:
+    runs-on: ubuntu-22.04
+
+    continue-on-error: true
+
+    strategy:
+      matrix:
+        mpi_library: [mpich, libopenmpi-dev]
+
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: add oneAPI to apt
+        shell: bash
+        run: |
+          cd /tmp
+          wget https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo apt-key add GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          rm GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
+          sudo add-apt-repository "deb https://apt.repos.intel.com/oneapi all main"
+
+      - name: install oneAPI dpcpp compiler
+        shell: bash
+        run: |
+          sudo apt update
+          sudo apt install intel-oneapi-compiler-dpcpp-cpp
+
+      - name: install oneAPI MKL library
+        shell: bash
+        run: |
+          sudo apt install intel-oneapi-mkl-devel
+
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Build
+        id: cmake_build
+        run: |
+          source /opt/intel/oneapi/setvars.sh
+          mkdir build
+          cd build
+          cmake -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake --build . --config Release -j $(nproc)
+
   # TODO: build with LLAMA_NO_METAL because test-backend-ops fail on "Apple Paravirtual device" and I don't know
   #       how to debug it.
   #       ref: https://github.com/ggerganov/llama.cpp/actions/runs/7131777249/job/19420981052#step:5:1124
diff --git a/ci/README.md b/ci/README.md
index 65cfe63ebe8e9..4064705190697 100644
--- a/ci/README.md
+++ b/ci/README.md
@@ -22,4 +22,8 @@ bash ./ci/run.sh ./tmp/results ./tmp/mnt
 
 # with CUDA support
 GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+
+# with SYCL support
+source /opt/intel/oneapi/setvars.sh
+GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 ```
diff --git a/ci/run.sh b/ci/run.sh
index 791b17a191a2b..755461e68e7ee 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -10,6 +10,9 @@
 # # with CUDA support
 # GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
 #
+# # with SYCL support
+# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
+#
 
 if [ -z "$2" ]; then
     echo "usage: $0 <output-dir> <mnt-dir>"
@@ -40,6 +43,14 @@ if [ ! -z ${GG_BUILD_CUDA} ]; then
     CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_CUBLAS=1"
 fi
 
+if [ ! -z ${GG_BUILD_SYCL} ]; then
+    if [ -z ${ONEAPI_ROOT} ]; then
+      echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
+      exit 1
+    fi
+
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx"
+fi
 ## helpers
 
 # download a file if it does not exist or if it is outdated
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 44ba952005669..dba62bd22fdba 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -3269,10 +3269,6 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo
         // printf("local buf %p size %d bytes\n", local_buf, total_size);
         ggml_sycl_set_device(g_main_device);
         dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
-
-        // printf("zjy before memcpy local_buf=%p, src->data=%p\n", local_buf, src->data);
-        printf("zjy log dst_ddf=%p main_stream=%p g_main_device_index=%d\n", src,
-            main_stream, g_main_device_index);
         main_stream->memcpy(local_buf, src, total_size);
     }
     else {
@@ -7657,6 +7653,20 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) {
     *dsti = *xi;
 }
 
+static void cpy_1_i16_i16(const char * cxi, char * cdsti) {
+    const int16_t *xi = (const int16_t *)cxi;
+    int16_t *dsti = (int16_t *)cdsti;
+
+    *dsti = *xi;
+}
+
+static void cpy_1_i32_i32(const char * cxi, char * cdsti) {
+    const int32_t *xi = (const int32_t *)cxi;
+    int32_t *dsti = (int32_t *)cdsti;
+
+    *dsti = *xi;
+}
+
 template <cpy_kernel_t cpy_1>
 static void cpy_f32_f16(const char * cx, char * cdst, const int ne,
                                    const int ne00, const int ne01, const int nb00, const int nb01, const int nb02,
@@ -10678,6 +10688,56 @@ static void ggml_cpy_f16_f16_sycl(const char *cx, char *cdst, const int ne,
     }
 }
 
+static void ggml_cpy_i16_i16_sycl(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        // dpct::has_capability_or_fail(stream->get_device(),
+        //                              {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_i16_i16>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
+static void ggml_cpy_i32_i32_sycl(const char *cx, char *cdst, const int ne,
+                                  const int ne00, const int ne01,
+                                  const int nb00, const int nb01,
+                                  const int nb02, const int ne10,
+                                  const int ne11, const int nb10,
+                                  const int nb11, const int nb12,
+                                  dpct::queue_ptr stream) {
+
+    const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE;
+    {
+        // dpct::has_capability_or_fail(stream->get_device(),
+        //                              {sycl::aspect::fp16});
+
+        stream->parallel_for(
+            sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) *
+                                  sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE),
+                              sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)),
+            [=](sycl::nd_item<3> item_ct1) {
+                cpy_f32_f16<cpy_1_i32_i32>(cx, cdst, ne, ne00, ne01, nb00, nb01,
+                                           nb02, ne10, ne11, nb10, nb11, nb12,
+                                           item_ct1);
+            });
+    }
+}
+
 static void scale_f32_sycl(const float *x, float *dst, const float scale,
                            const int k, dpct::queue_ptr stream) {
     const int num_blocks = (k + SYCL_SCALE_BLOCK_SIZE - 1) / SYCL_SCALE_BLOCK_SIZE;
@@ -11550,8 +11610,6 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0,
                                    float *dst_dd,
                                    const dpct::queue_ptr &main_stream) {
 
-    GGML_ASSERT(src1->type == GGML_TYPE_F32);
-
     if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
         op()(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
@@ -11560,6 +11618,12 @@ inline void ggml_sycl_op_bin_bcast(const ggml_tensor *src0,
     } else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
         op()(src0, src1, dst, (const sycl::half *)src0_dd, src1_dd, dst_dd,
              main_stream);
+    } else if (src0->type == GGML_TYPE_I32 && dst->type == GGML_TYPE_I32) {
+        op()(src0, src1, dst, (const int32_t *)src0_dd, (const int32_t *)src1_dd, (int32_t *)dst_dd,
+             main_stream);
+    } else if (src0->type == GGML_TYPE_I16 && dst->type == GGML_TYPE_I16) {
+        op()(src0, src1, dst, (const int16_t *)src0_dd, (const int16_t *)src1_dd, (int16_t *)dst_dd,
+             main_stream);
     } else {
         fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
             ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
@@ -13845,6 +13909,10 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
         ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) {
+        ggml_cpy_i16_i16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
+    } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_I32) {
+        ggml_cpy_i32_i32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, nb00, nb01, nb02, ne10, ne11, nb10, nb11, nb12, main_stream);
     } else {
         fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
                 ggml_type_name(src0->type), ggml_type_name(src1->type));

From 7babd76903b81e62527f99ca1cfbfc47fed646e0 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 24 Jan 2024 14:53:04 +0800
Subject: [PATCH 60/90] fix action ID format issue

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 820878a8f4d22..386227b4d99f5 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -143,7 +143,7 @@ jobs:
           cd build
           ctest --verbose
 
-  ubuntu-22.04-cmake-sycl:
+  ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04
 
     continue-on-error: true

From 04a46c46f8ccd5104e5dd242ba6027bbddcbc687 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 24 Jan 2024 14:55:59 +0800
Subject: [PATCH 61/90] rm unused strategy

---
 .github/workflows/build.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 386227b4d99f5..6dcb432d625cc 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -148,10 +148,6 @@ jobs:
 
     continue-on-error: true
 
-    strategy:
-      matrix:
-        mpi_library: [mpich, libopenmpi-dev]
-
     steps:
       - uses: actions/checkout@v2
 

From 799af056199562f9394122b32b80b587b1921dbd Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Wed, 24 Jan 2024 07:45:36 +0000
Subject: [PATCH 62/90] enable llama_f16 in ci

---
 ci/run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/run.sh b/ci/run.sh
index 755461e68e7ee..a98f5a965ca6a 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -49,7 +49,7 @@ if [ ! -z ${GG_BUILD_SYCL} ]; then
       exit 1
     fi
 
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"
 fi
 ## helpers
 

From ec5c8bc0c93d30da8909d01978c7803ec7c91305 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 24 Jan 2024 16:34:41 +0800
Subject: [PATCH 63/90] fix conflict

---
 CMakeLists.txt | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index f70a3904f8aa7..7c3a82b7abe65 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -504,6 +504,13 @@ function(get_flags CCID CCVER)
         if (CCVER VERSION_GREATER_EQUAL 8.1.0)
             set(CXX_FLAGS ${CXX_FLAGS} -Wextra-semi)
         endif()
+    elseif (CCID MATCHES "Intel")
+        if (NOT LLAMA_SYCL)
+          # enable max optimization level when using Intel compiler
+          set(C_FLAGS   -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+          set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+          add_link_options(-fuse-ld=lld -static-intel)
+        endif()
     endif()
 
     set(GF_C_FLAGS   ${C_FLAGS}   PARENT_SCOPE)

From 22e1b45c02a4b532cd01b9330d034d2b6696e1cb Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 24 Jan 2024 17:24:24 +0800
Subject: [PATCH 64/90] fix build break on MacOS, due to CI of MacOS depend on
 external ggml, instead of internal ggml

---
 llama.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/llama.cpp b/llama.cpp
index ce74fb557aff1..b8d0f1bbae758 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6703,10 +6703,14 @@ static int llama_decode_internal(
     }
 
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
-    if ((ggml_cpu_has_cublas() || ggml_cpu_has_sycl()) && fully_offloaded) {
+    if (ggml_cpu_has_cublas() && fully_offloaded) {
         n_threads = 1;
     }
 
+#ifdef GGML_USE_SYCL
+    n_threads = 1;
+#endif
+
 #ifdef GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);

From 67de350e792f66ff33ef811c18d6712ea0e254ef Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 24 Jan 2024 19:51:08 +0800
Subject: [PATCH 65/90] fix ci cases for unsupported data type

---
 ggml-sycl.cpp | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index dba62bd22fdba..f24065117a92a 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -15028,6 +15028,14 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
                 if (a->ne[3] != b->ne[3]) {
                     return false;
                 }
+
+                if (a->type == GGML_TYPE_IQ2_XXS) {
+                    return false;
+                }
+                if (a->type == GGML_TYPE_IQ2_XS) {
+                    return false;
+                }
+
                 return true;
             } break;
         case GGML_OP_GET_ROWS:
@@ -15069,6 +15077,15 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
                 }
                 return false;
             } break;
+        case GGML_OP_CONCAT:
+            {
+                ggml_type src0_type = op->src[0]->type;
+                if (src0_type == GGML_TYPE_F32) {
+                    return true;
+                } else {
+                    return false;
+                }
+            } break;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -15093,7 +15110,6 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten
         case GGML_OP_SUM_ROWS:
         case GGML_OP_ARGSORT:
         case GGML_OP_ACC:
-        case GGML_OP_CONCAT:
         case GGML_OP_GROUP_NORM:
         case GGML_OP_UPSCALE:
         case GGML_OP_PAD:

From fb15de38efdecab93d73cb52d413d32ea2f38924 Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Wed, 24 Jan 2024 12:18:30 +0000
Subject: [PATCH 66/90] revert unrelated changed in cuda cmake remove useless
 nommq fix typo of GGML_USE_CLBLAS_SYCL

---
 CMakeLists.txt    |  8 +++++++-
 common/common.cpp | 20 +++++---------------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 7c3a82b7abe65..e766833c19721 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -345,10 +345,16 @@ if (LLAMA_CUBLAS)
         add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
 
         if (LLAMA_STATIC)
-            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            if (WIN32)
+                # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+            else ()
+                set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+            endif()
         else()
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
         endif()
+	set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
 
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         # 52 == lowest CUDA 12 standard
diff --git a/common/common.cpp b/common/common.cpp
index a8abf04ebd939..1bd2bc1ecaf02 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -603,9 +603,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.main_gpu = std::stoi(argv[i]);
-#ifndef GGML_USE_CLBLAS_SYCL
+#ifndef GGML_USE_CUBLAS_SYCL
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n");
-#endif // GGML_USE_CLBLAS_SYCL
+#endif // GGML_USE_CUBLAS_SYCL
         } else if (arg == "--split-mode" || arg == "-sm") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -622,9 +622,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#ifndef GGML_USE_CLBLAS_SYCL
+#ifndef GGML_USE_CUBLAS_SYCL
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the split mode has no effect.\n");
-#endif // GGML_USE_CLBLAS_SYCL
+#endif // GGML_USE_CUBLAS_SYCL
 
         } else if (arg == "--tensor-split" || arg == "-ts") {
             if (++i >= argc) {
@@ -651,7 +651,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
             }
 #else
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
-#endif // GGML_USE_CLBLAS_SYCL
+#endif // GGML_USE_CUBLAS_SYCL
         } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
             params.mul_mat_q = false;
@@ -1020,16 +1020,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -mg i, --main-gpu i   the GPU to use for the model (with split-mode = none),\n");
     printf("                        or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu);
 #endif // LLAMA_SUPPORTS_GPU_OFFLOAD
-#ifdef GGML_USE_CLBLAS
-    printf("  -nommq, --no-mul-mat-q\n");
-    printf("                        use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n");
-    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
-#endif // GGML_USE_CLBLAS
-#ifdef GGML_USE_SYCL
-    printf("  -nommq, --no-mul-mat-q\n");
-    printf("                        use " GGML_SYCL_NAME " instead of custom mul_mat_q " GGML_SYCL_NAME " kernels.\n");
-    printf("                        Not recommended since this is both slower and uses more VRAM.\n");
-#endif // GGML_USE_SYCL
     printf("  --verbose-prompt      print a verbose prompt before generation (default: %s)\n", params.verbose_prompt ? "true" : "false");
     printf("  --no-display-prompt   don't print prompt at generation (default: %s)\n", !params.display_prompt ? "true" : "false");
     printf("  -gan N, --grp-attn-n N\n");

From 96186a742a6eaa64718452b8f269f17e87a549f0 Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Wed, 24 Jan 2024 12:23:23 +0000
Subject: [PATCH 67/90] revert hip cmake changes

---
 CMakeLists.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e766833c19721..081fd6c5c0bba 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -354,6 +354,7 @@ if (LLAMA_CUBLAS)
         else()
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
         endif()
+
 	set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
 
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
@@ -432,6 +433,9 @@ if (LLAMA_HIPBLAS)
     if (${hipblas_FOUND} AND ${hip_FOUND})
         message(STATUS "HIP and hipBLAS found")
         add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
+	if (LLAMA_HIP_UMA)
+            add_compile_definitions(GGML_HIP_UMA)
+        endif()
         add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
         if (BUILD_SHARED_LIBS)
             set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)

From d07a88d448894e35fdc3b61c8ce8568bec381d9e Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Wed, 24 Jan 2024 12:29:17 +0000
Subject: [PATCH 68/90] fix indent

---
 CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 081fd6c5c0bba..100ffe2c103b5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -355,7 +355,7 @@ if (LLAMA_CUBLAS)
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
         endif()
 
-	set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
 
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         # 52 == lowest CUDA 12 standard
@@ -433,7 +433,7 @@ if (LLAMA_HIPBLAS)
     if (${hipblas_FOUND} AND ${hip_FOUND})
         message(STATUS "HIP and hipBLAS found")
         add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-	if (LLAMA_HIP_UMA)
+        if (LLAMA_HIP_UMA)
             add_compile_definitions(GGML_HIP_UMA)
         endif()
         add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)

From 8dd1b60ac6162120cd668690988123b0fbb5c2a6 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Wed, 24 Jan 2024 20:34:44 +0800
Subject: [PATCH 69/90] add prefix in func name

---
 examples/sycl/ls-sycl-device.cpp | 2 +-
 ggml-sycl.cpp                    | 5 +++--
 ggml-sycl.h                      | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp
index 8623314319515..b1c65a84513fb 100644
--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@@ -1,6 +1,6 @@
 #include "ggml-sycl.h"
 
 int main(int argc, char ** argv) {
-    print_sycl_devices();
+    ggml_backend_sycl_print_sycl_devices();
     return 0;
 }
\ No newline at end of file
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index f24065117a92a..51ceeade744ef 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -11238,7 +11238,8 @@ static bool g_sycl_loaded = false;
 bool ggml_sycl_loaded(void) {
     return g_sycl_loaded;
 }
-void print_sycl_devices(){
+
+void ggml_backend_sycl_print_sycl_devices(){
     int device_count = dpct::dev_mgr::instance().device_count();
     fprintf(stderr, "found %d SYCL devices:\n", device_count);
     for (int id = 0; id < device_count; ++id) {
@@ -11311,7 +11312,7 @@ void ggml_init_sycl() try {
 #else
         fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__);
 #endif
-        print_sycl_devices();
+        ggml_backend_sycl_print_sycl_devices();
         for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) {
             g_sycl_device_id2index[id].index = -1;
             g_device_caps[id].vmm = 0;
diff --git a/ggml-sycl.h b/ggml-sycl.h
index 02bfa72072124..b049718add1c9 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -15,7 +15,7 @@ GGML_API bool   ggml_sycl_compute_forward(struct ggml_compute_params * params, s
 GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
-GGML_API void   print_sycl_devices();
+GGML_API void   ggml_backend_sycl_print_sycl_devices();
 
 #ifdef  __cplusplus
 }

From 3aabd8a274817492861b5aa01507b21cae8ed2aa Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Wed, 24 Jan 2024 12:37:29 +0000
Subject: [PATCH 70/90] revert no mmq

---
 common/common.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/common/common.cpp b/common/common.cpp
index 1bd2bc1ecaf02..26351bdb10f26 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -631,7 +631,6 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
             std::string arg_next = argv[i];
 
             // split string by , and /
@@ -649,15 +648,9 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                     params.tensor_split[i] = 0.0f;
                 }
             }
-#else
+#ifndef GGML_USE_CUBLAS_SYCL
             fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting a tensor split has no effect.\n");
 #endif // GGML_USE_CUBLAS_SYCL
-        } else if (arg == "--no-mul-mat-q" || arg == "-nommq") {
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL)
-            params.mul_mat_q = false;
-#else
-            fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS or SYCL. Disabling mul_mat_q kernels has no effect.\n");
-#endif // GGML_USE_CUBLAS || GGML_USE_SYCL
         } else if (arg == "--no-mmap") {
             params.use_mmap = false;
         } else if (arg == "--numa") {

From 18742f7afc90b4c21df22abbe78053dc0fa4c3c0 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Wed, 24 Jan 2024 18:14:45 +0530
Subject: [PATCH 71/90] rm cpu blas duplicate

---
 common/common.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/common/common.cpp b/common/common.cpp
index 26351bdb10f26..2880136761afb 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1519,7 +1519,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_avx512: %s\n",      ggml_cpu_has_avx512()      ? "true" : "false");
     fprintf(stream, "cpu_has_avx512_vbmi: %s\n", ggml_cpu_has_avx512_vbmi() ? "true" : "false");
     fprintf(stream, "cpu_has_avx512_vnni: %s\n", ggml_cpu_has_avx512_vnni() ? "true" : "false");
-    fprintf(stream, "cpu_has_blas: %s\n",        ggml_cpu_has_blas()        ? "true" : "false");
     fprintf(stream, "cpu_has_cublas: %s\n",      ggml_cpu_has_cublas()      ? "true" : "false");
     fprintf(stream, "cpu_has_clblast: %s\n",     ggml_cpu_has_clblast()     ? "true" : "false");
     fprintf(stream, "cpu_has_fma: %s\n",         ggml_cpu_has_fma()         ? "true" : "false");

From 0e235fb816cab846cc27debd065877335c31dd7c Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Wed, 24 Jan 2024 13:16:38 +0000
Subject: [PATCH 72/90] fix no_new_line

---
 examples/sycl/ls-sycl-device.cpp | 2 +-
 ggml-sycl.h                      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp
index b1c65a84513fb..2f9d8f78e61ad 100644
--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@@ -3,4 +3,4 @@
 int main(int argc, char ** argv) {
     ggml_backend_sycl_print_sycl_devices();
     return 0;
-}
\ No newline at end of file
+}
diff --git a/ggml-sycl.h b/ggml-sycl.h
index b049718add1c9..a2e65b29240f1 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -19,4 +19,4 @@ GGML_API void   ggml_backend_sycl_print_sycl_devices();
 
 #ifdef  __cplusplus
 }
-#endif
\ No newline at end of file
+#endif

From 5600118221277dd4bc8d5da16616f7fc047c4136 Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Wed, 24 Jan 2024 21:29:21 +0800
Subject: [PATCH 73/90] fix src1->type==F16 bug.

---
 ggml-sycl.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 51ceeade744ef..6a63ddab7759a 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -12097,10 +12097,14 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
         src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
 
     if (src1_convert_f16) {
-        src1_dfloat = src1_dfloat_a.alloc(ne00);
-        ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
-                              ne00, ne00, 1, sizeof(float), 0, 0, ne00, 1,
-                              sizeof(sycl::half), 0, 0, stream);
+        if (src1->type == GGML_TYPE_F16) {
+            src1_dfloat = (sycl::half *)src1->data + row_low * src1_ncols;
+        } else {
+            src1_dfloat = src1_dfloat_a.alloc(ne00);
+            ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
+                                  ne00, ne00, 1, sizeof(float), 0, 0, ne00, 1,
+                                  sizeof(sycl::half), 0, 0, stream);
+        }
     }
 #else
     const dfloat * src1_dfloat = (const dfloat *) src1_ddf_i; // dfloat == float, no conversion

From eef5faae182f5ab1fa21169bbc5f4579aae6b862 Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Wed, 24 Jan 2024 21:46:48 +0800
Subject: [PATCH 74/90] pass batch offset for F16 src1

---
 ggml-sycl.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 6a63ddab7759a..91feb53984493 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -12098,7 +12098,7 @@ inline void ggml_sycl_op_dequantize_mul_mat_vec(
 
     if (src1_convert_f16) {
         if (src1->type == GGML_TYPE_F16) {
-            src1_dfloat = (sycl::half *)src1->data + row_low * src1_ncols;
+            src1_dfloat = (sycl::half *)src1->data + src1_padded_row_size;
         } else {
             src1_dfloat = src1_dfloat_a.alloc(ne00);
             ggml_cpy_f32_f16_sycl((const char *)src1_ddf_i, (char *)src1_dfloat,
@@ -12729,7 +12729,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
     const bool src0_is_contiguous = ggml_is_contiguous(src0);
     const bool src1_is_contiguous = ggml_is_contiguous(src1);
 
-    const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
+    int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
 
     const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
     GGML_ASSERT(!(split && ne02 > 1));
@@ -12919,7 +12919,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
                 if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) {
                     SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream));
                 }
-
+                if (src1->type == GGML_TYPE_F16) {
+                    src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10;
+                }
                 // do the computation
                 op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i,
                    row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream);

From 5bb93d41b75f66ac46365447c47ad26776dd8861 Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Wed, 24 Jan 2024 21:56:47 +0800
Subject: [PATCH 75/90] fix batch error

---
 ggml-sycl.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 91feb53984493..c83568098f6ef 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -12206,9 +12206,10 @@ inline void ggml_sycl_op_mul_mat_sycl(
             src0_as_f16.alloc(ne);
             to_fp16_sycl(src0_dd_i, src0_as_f16.get(), ne, stream);
         }
-        const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16
-                                         ? (const sycl::half *)src0_dd_i
-                                         : src0_as_f16.get();
+        const sycl::half *src1_ptr =
+            src1->type == GGML_TYPE_F16
+                ? (const sycl::half *)src1->data + src1_padded_row_size
+                : src1_as_f16.get();
 
         sycl_pool_alloc<sycl::half> src1_as_f16;
         if (src1->type != GGML_TYPE_F16) {

From 0635f844c984b75c50a4e65e1c7db7146c44759d Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Wed, 24 Jan 2024 22:00:24 +0800
Subject: [PATCH 76/90] fix wrong code

---
 ggml-sycl.cpp | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index c83568098f6ef..6a0d12d63c144 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -12206,10 +12206,9 @@ inline void ggml_sycl_op_mul_mat_sycl(
             src0_as_f16.alloc(ne);
             to_fp16_sycl(src0_dd_i, src0_as_f16.get(), ne, stream);
         }
-        const sycl::half *src1_ptr =
-            src1->type == GGML_TYPE_F16
-                ? (const sycl::half *)src1->data + src1_padded_row_size
-                : src1_as_f16.get();
+        const sycl::half *src0_ptr = src0->type == GGML_TYPE_F16
+                                         ? (const sycl::half *)src0_dd_i
+                                         : src0_as_f16.get();
 
         sycl_pool_alloc<sycl::half> src1_as_f16;
         if (src1->type != GGML_TYPE_F16) {
@@ -12220,7 +12219,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
             to_fp16_sycl(src1_ddf_i, src1_as_f16.get(), ne, stream);
         }
         const sycl::half *src1_ptr = src1->type == GGML_TYPE_F16
-                                         ? (const sycl::half *)src1_ddf_i
+                ? (const sycl::half *)src1->data + src1_padded_row_size
                                          : src1_as_f16.get();
         sycl_pool_alloc<sycl::half> dst_f16(row_diff * src1_ncols);
 

From f1bab5010003d44ae7388c04921a76e6a2d335c6 Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Thu, 25 Jan 2024 09:21:16 +0000
Subject: [PATCH 77/90] revert sycl checking in test-sampling

---
 tests/test-sampling.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index e9dd6af1b7454..9427fc09c2efa 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -8,9 +8,7 @@
 #include <cmath>
 #include <numeric>
 
-#ifndef GGML_USE_SYCL
 #include <cassert>
-#endif
 
 #include <vector>
 #include <algorithm>

From 66e24c246823d48e6af6da7a7a877063df194d0b Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Thu, 25 Jan 2024 09:26:02 +0000
Subject: [PATCH 78/90] pass void as arguments of
 ggml_backend_sycl_print_sycl_devices

---
 ggml-sycl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml-sycl.h b/ggml-sycl.h
index a2e65b29240f1..629159dd20159 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -15,7 +15,7 @@ GGML_API bool   ggml_sycl_compute_forward(struct ggml_compute_params * params, s
 GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
 GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
-GGML_API void   ggml_backend_sycl_print_sycl_devices();
+GGML_API void   ggml_backend_sycl_print_sycl_devices(void);
 
 #ifdef  __cplusplus
 }

From b06dca678b251c94e05aedffa8aecc69c2bea388 Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Thu, 25 Jan 2024 09:38:36 +0000
Subject: [PATCH 79/90] remove extra blank line in test-sampling

---
 tests/test-sampling.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test-sampling.cpp b/tests/test-sampling.cpp
index 9427fc09c2efa..32e58941c0ee0 100644
--- a/tests/test-sampling.cpp
+++ b/tests/test-sampling.cpp
@@ -7,9 +7,7 @@
 
 #include <cmath>
 #include <numeric>
-
 #include <cassert>
-
 #include <vector>
 #include <algorithm>
 

From 05b7f9be6f24104071344f1ff8c73273c84430c9 Mon Sep 17 00:00:00 2001
From: "Meng, Hengyu" <hengyu.meng@intel.com>
Date: Thu, 25 Jan 2024 09:44:19 +0000
Subject: [PATCH 80/90] revert setting n_threads in sycl

---
 llama.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llama.cpp b/llama.cpp
index 5f27088f7bfc8..1593fb50da61a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6707,10 +6707,6 @@ static int llama_decode_internal(
         n_threads = 1;
     }
 
-#ifdef GGML_USE_SYCL
-    n_threads = 1;
-#endif
-
 #ifdef GGML_USE_MPI
     const int64_t n_layer = hparams.n_layer;
     ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);

From d6a6505018fa94fc82c4475053c50abb5b842b24 Mon Sep 17 00:00:00 2001
From: luoyu-intel <yu.luo@intel.com>
Date: Fri, 26 Jan 2024 18:31:04 +0800
Subject: [PATCH 81/90] implement std::isinf for icpx with fast math.

---
 tests/test-backend-ops.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 55ce14e0d902c..fc58c2d338861 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -240,10 +240,17 @@ static std::string var_to_str(ggml_type type) {
 #define VARS_TO_STR10(a, b, c, d, e, f, g, h, i, j) VAR_TO_STR(a) + "," + VARS_TO_STR9(b, c, d, e, f, g, h, i, j)
 #define VARS_TO_STR11(a, b, c, d, e, f, g, h, i, j, k) VAR_TO_STR(a) + "," + VARS_TO_STR10(b, c, d, e, f, g, h, i, j, k)
 
+#ifdef GGML_USE_SYCL
+static bool inline _isinf(float f) {
+    return (*(uint32_t *)&f & 0x7fffffff) == 0x7f800000;
+}
+#else
+static bool inline _isinf(float f) { return std::isinf(f); }
+#endif
 
 // accept FLT_MAX as infinity
 static bool isinf_or_max(float f) {
-    return std::isinf(f) || f == FLT_MAX || f == -FLT_MAX;
+    return _isinf(f) || f == FLT_MAX || f == -FLT_MAX;
 }
 
 static bool ggml_is_view_op(enum ggml_op op) {

From 174c9a0ed684285654130e661922bafbc04345de Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Fri, 26 Jan 2024 20:40:50 +0530
Subject: [PATCH 82/90] Update ci/run.sh

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 ci/run.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ci/run.sh b/ci/run.sh
index a98f5a965ca6a..7584f651fffc7 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -45,8 +45,8 @@ fi
 
 if [ ! -z ${GG_BUILD_SYCL} ]; then
     if [ -z ${ONEAPI_ROOT} ]; then
-      echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
-      exit 1
+        echo "Not detected ONEAPI_ROOT, please install oneAPI base toolkit and enable it by:\n source /opt/intel/oneapi/setvars.sh"
+        exit 1
     fi
 
     CMAKE_EXTRA="${CMAKE_EXTRA} -DLLAMA_SYCL=1 DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON"

From c08fec2a38233ff36ff020856abcf7a6c550d9ee Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Fri, 26 Jan 2024 20:43:17 +0530
Subject: [PATCH 83/90] Update examples/sycl/run-llama2.sh

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 examples/sycl/run-llama2.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh
index 045e01a9435b8..61f589683bc1c 100755
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 
 INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh

From 2cba564b492fc9f90cf23579e595b79869f4ec98 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Fri, 26 Jan 2024 20:48:49 +0530
Subject: [PATCH 84/90] Update examples/sycl/run-llama2.sh

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 examples/sycl/run-llama2.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh
index 61f589683bc1c..3bdac73771f75 100755
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -4,9 +4,9 @@ INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh
 
 if [ $# -gt 0 ]; then
-  export GGML_SYCL_DEVICE=$1
+    export GGML_SYCL_DEVICE=$1
 else
-  export GGML_SYCL_DEVICE=0
+    export GGML_SYCL_DEVICE=0
 fi
 echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE
 #export GGML_SYCL_DEBUG=1

From f7070510660740e5f1bc7dad48e0888bbdc55b85 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Fri, 26 Jan 2024 20:49:07 +0530
Subject: [PATCH 85/90] Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 100ffe2c103b5..9b7796ebdb27c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -464,7 +464,7 @@ endif()
 
 if (LLAMA_SYCL)
     if ( NOT DEFINED ENV{ONEAPI_ROOT})
-    message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
+        message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
     endif()
     #todo: AOT
 

From 45b061807891616d30de0e48a5f60df5de06a73e Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Fri, 26 Jan 2024 20:49:25 +0530
Subject: [PATCH 86/90] Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9b7796ebdb27c..32370f07b4c6b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -516,10 +516,10 @@ function(get_flags CCID CCVER)
         endif()
     elseif (CCID MATCHES "Intel")
         if (NOT LLAMA_SYCL)
-          # enable max optimization level when using Intel compiler
-          set(C_FLAGS   -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-          set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
-          add_link_options(-fuse-ld=lld -static-intel)
+            # enable max optimization level when using Intel compiler
+            set(C_FLAGS   -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+            set(CXX_FLAGS -ipo -O3 -static -fp-model=fast -flto -fno-stack-protector)
+            add_link_options(-fuse-ld=lld -static-intel)
         endif()
     endif()
 

From 553175402bd7d43aaad588c0697839044b5455e8 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Fri, 26 Jan 2024 20:49:48 +0530
Subject: [PATCH 87/90] Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 32370f07b4c6b..98ff4a448ac5e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -833,7 +833,7 @@ add_library(ggml OBJECT
             ${GGML_SOURCES_METAL}  ${GGML_HEADERS_METAL}
             ${GGML_SOURCES_MPI}    ${GGML_HEADERS_MPI}
             ${GGML_SOURCES_EXTRA}  ${GGML_HEADERS_EXTRA}
-            ${GGML_SOURCES_SYCL}  ${GGML_HEADERS_SYCL}
+            ${GGML_SOURCES_SYCL}   ${GGML_HEADERS_SYCL}
             )
 
 target_include_directories(ggml PUBLIC . ${LLAMA_EXTRA_INCLUDES})

From b9ffaab1886f0e32c3ceb62565cb0769c8eaa767 Mon Sep 17 00:00:00 2001
From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com>
Date: Fri, 26 Jan 2024 20:50:17 +0530
Subject: [PATCH 88/90] Update CMakeLists.txt

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
---
 CMakeLists.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 98ff4a448ac5e..4a755b7b42bef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -485,11 +485,8 @@ if (LLAMA_SYCL)
     set(GGML_SOURCES_SYCL ggml-sycl.cpp)
 
     set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
-
 endif()
 
-
-
 function(get_flags CCID CCVER)
     set(C_FLAGS "")
     set(CXX_FLAGS "")

From 2ab97157c98eed08a0f361fe3f847736c96d5fce Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Fri, 26 Jan 2024 23:44:18 +0800
Subject: [PATCH 89/90] add copyright and MIT license declare

---
 examples/sycl/CMakeLists.txt     | 4 ++++
 examples/sycl/build.sh           | 5 +++++
 examples/sycl/ls-sycl-device.cpp | 5 +++++
 examples/sycl/run-llama2.sh      | 4 ++++
 ggml-sycl.cpp                    | 5 +++++
 ggml-sycl.h                      | 5 +++++
 6 files changed, 28 insertions(+)

diff --git a/examples/sycl/CMakeLists.txt b/examples/sycl/CMakeLists.txt
index ad759ae80fea5..69cf8932eb5c7 100644
--- a/examples/sycl/CMakeLists.txt
+++ b/examples/sycl/CMakeLists.txt
@@ -1,3 +1,7 @@
+#  MIT license
+#  Copyright (C) 2024 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
 set(TARGET ls-sycl-device)
 add_executable(${TARGET} ls-sycl-device.cpp)
 install(TARGETS ${TARGET} RUNTIME)
diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh
index 5556f3c6c1edb..26ad2f7da754e 100755
--- a/examples/sycl/build.sh
+++ b/examples/sycl/build.sh
@@ -1,3 +1,8 @@
+
+#  MIT license
+#  Copyright (C) 2024 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
 mkdir -p build
 cd build
 source /opt/intel/oneapi/setvars.sh
diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp
index 2f9d8f78e61ad..42847154aa3ea 100644
--- a/examples/sycl/ls-sycl-device.cpp
+++ b/examples/sycl/ls-sycl-device.cpp
@@ -1,3 +1,8 @@
+/*MIT license
+  Copyright (C) 2024 Intel Corporation
+  SPDX-License-Identifier: MIT
+*/
+
 #include "ggml-sycl.h"
 
 int main(int argc, char ** argv) {
diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh
index 3bdac73771f75..f5f4c1e980de4 100755
--- a/examples/sycl/run-llama2.sh
+++ b/examples/sycl/run-llama2.sh
@@ -1,5 +1,9 @@
 #!/bin/bash
 
+#  MIT license
+#  Copyright (C) 2024 Intel Corporation
+#  SPDX-License-Identifier: MIT
+
 INPUT2="Building a website can be done in 10 simple steps:\nStep 1:"
 source /opt/intel/oneapi/setvars.sh
 
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
index 6a0d12d63c144..9764d9c351516 100644
--- a/ggml-sycl.cpp
+++ b/ggml-sycl.cpp
@@ -1,3 +1,8 @@
+/*MIT license
+  Copyright (C) 2024 Intel Corporation
+  SPDX-License-Identifier: MIT
+*/
+
 #include <algorithm>
 #include <assert.h>
 #include <atomic>
diff --git a/ggml-sycl.h b/ggml-sycl.h
index 629159dd20159..0eabb53cc2377 100644
--- a/ggml-sycl.h
+++ b/ggml-sycl.h
@@ -1,3 +1,8 @@
+/*MIT license
+  Copyright (C) 2024 Intel Corporation
+  SPDX-License-Identifier: MIT
+*/
+
 #pragma once
 
 #include "ggml.h"

From d394ca7f3c3d8dd05c199bf33d11457c97a0e781 Mon Sep 17 00:00:00 2001
From: jianyuzh <jianyu.zhang@intel.com>
Date: Sun, 28 Jan 2024 09:15:57 +0800
Subject: [PATCH 90/90] update the cmd example

---
 README_sycl.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README_sycl.md b/README_sycl.md
index f057f06ef1b27..d5a1818f5a268 100644
--- a/README_sycl.md
+++ b/README_sycl.md
@@ -190,7 +190,7 @@ found 4 SYCL devices:
 Set device ID = 0 by **GGML_SYCL_DEVICE=0**
 
 ```
-GGML_SYCL_DEVICE=0 && ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
+GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33
 ```
 or run by script: